Added basic support for autotagging microdata annotated item types.

With the appropriate vocabulary settings on the Vocabulary_p.html page, this can produce Vocabulary search facets displaying the item types referenced in HTML documents by microdata annotations. Tested notably with, but not limited to, vocabulary classes/types defined by Schema.org and Dublin Core.
7 years ago · 9412881230
parent 5a14d34a7d
commit 9412881230
12 changed files with 412 additions and 14 deletions
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@ -279,6 +279,11 @@ surrogates.out = DATA/SURROGATES/out
 # this directory also contains subdirectories for input sources, the did-you-mean function and other
 dictionaries = DATA/DICTIONARIES

+# Set of comma separated vocabulary names whose terms should only be matched 
+# from linked data types annotations in documents (with microdata, RDFa, microformats...) 
+# instead of cleartext words
+vocabularies.matchLinkedData.names =
+
 # a path to the classification directory
 # each subdirectory is the name of a context (which becomes a navigator) with '.txt' files
 # containing texts to teach a bayesian filter. One of the files must be named 'negative.txt'.
@ -887,6 +892,7 @@ search.result.show.cache = true
 search.result.show.proxy = false
 search.result.show.hostbrowser = true
 search.result.show.vocabulary = false
+# Set of comma separated vocabulary names not to be used as search results facets
 search.result.show.vocabulary.omit = 
 search.result.show.snapshots = false
 # when true, display the raw ranking score value
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@ -162,6 +162,15 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
        <dt>Prefix</dt><dd>#[prefix]#</dd>
        <dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
        <dt>Is Facet?</dt><dd><input type="checkbox" name="isFacet"#(isFacet)#:: checked="checked"#(/isFacet)#/> (If checked, this vocabulary is used for search facets. Not feasible for large vocabularies!)</dd>
+        <dt>Match terms from</dt>
+        <dd>
+        	<label>
+          		<input type="radio" name="vocabularies.matchLinkedData" value="false" #(vocabularies.matchLinkedData)#checked="checked"::#(/vocabularies.matchLinkedData)# />Cleartext
+          	</label>
+          	<label>
+          		<input type="radio" name="vocabularies.matchLinkedData" value="true" #(vocabularies.matchLinkedData)#::checked="checked"#(/vocabularies.matchLinkedData)# />Linked data/Semantic web annotations
+          	</label>
+        </dd>
      </dl>
      <table class="sortable" border="0">
      <tr class="TableHeader" valign="bottom">
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -49,6 +49,7 @@ import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
+import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.index.Segment;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
@ -238,13 +239,27 @@ public class Vocabulary_p {
                        }
                    }
                    
-                    // check the isFacet property
+                    // check the isFacet and isMatchFromLinkedData properties
                    if (vocabulary != null && post.containsKey("set")) {
                        boolean isFacet = post.getBoolean("isFacet");
                        vocabulary.setFacet(isFacet);
                        Set<String> omit = env.getConfigSet("search.result.show.vocabulary.omit");
-                        if (isFacet) omit.remove(vocabularyName); else omit.add(vocabularyName);
+                        if (isFacet) {
+                        	omit.remove(vocabularyName); 
+                        } else {
+                        	omit.add(vocabularyName);
+                        }
                        env.setConfig("search.result.show.vocabulary.omit", omit);
+                        
+                        boolean isMatchFromLinkedData = post.getBoolean("vocabularies.matchLinkedData");
+                        vocabulary.setMatchFromLinkedData(isMatchFromLinkedData);
+                        final Set<String> matchLinkedDataVocs = env.getConfigSet(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES);
+                        if (isMatchFromLinkedData) {
+                        	matchLinkedDataVocs.add(vocabularyName);
+                        } else {
+                        	matchLinkedDataVocs.remove(vocabularyName); 
+                        }
+                        env.setConfig(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES, matchLinkedDataVocs);
                    }
                }
            } catch (final IOException e) {
@ -273,6 +288,7 @@ public class Vocabulary_p {
            prop.putXML("edit_namexml", vocabulary.getName());
            prop.putHTML("edit_namespace", vocabulary.getNamespace());
            prop.put("edit_isFacet", vocabulary.isFacet() ? 1 : 0);
+            prop.put("edit_vocabularies.matchLinkedData", vocabulary.isMatchFromLinkedData());
            prop.put("edit_size", vocabulary.size());
            prop.putHTML("edit_predicate", vocabulary.getPredicate());
            prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -951,6 +951,11 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        return this.protocol;
    }

+    /**
+     * @return this URL's fragment, or null if it has no fragment
+     * @see <a href="https://url.spec.whatwg.org/#concept-url-fragment">URL fragment concept at WHATWG</a>
+     * @see <a href="https://tools.ietf.org/html/rfc3986#section-3.5">URL fragment section in RFC 3986</a> 
+     */
    public String getRef() {
        return this.anchor;
    }
--- a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
+++ b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
@ -28,6 +28,9 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;

+import org.apache.commons.lang.StringUtils;
+
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.geo.Locations;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.util.ConcurrentLog;
@ -44,7 +47,8 @@ public class AutotaggingLibrary {
    private final static Object PRESENT = new Object();

    private final File autotaggingPath;
-    private final Map<String, Tagging> vocabularies; // mapping from vocabulary name to the tagging vocabulary
+    /** mapping from vocabulary name to the tagging vocabulary */
+    private final Map<String, Tagging> vocabularies;
    private final Map<String, Object> allTags;

    /**
@ -81,6 +85,25 @@ public class AutotaggingLibrary {
            }
        }
    }
+    
+	/**
+	 * Create a new Autotagging instance from the provided vocabularies. Can be used
+	 * for example for testing purpose.
+	 * 
+	 * @param vocabularies
+	 *            mapping from vocabulary name to the tagging vocabulary. The map is
+	 *            kept by reference (not copied). When null, a new empty map is used
+	 *            instead.
+	 */
+    protected AutotaggingLibrary(final Map<String, Tagging> vocabularies) {
+    	if(vocabularies != null) {
+    		this.vocabularies = vocabularies;
+    	} else {
+    		this.vocabularies = new ConcurrentHashMap<String, Tagging>();
+    	}
+    	this.allTags = new ConcurrentHashMap<String, Object>();
+    	/* No vocabulary directory is associated with an instance created this way */
+    	this.autotaggingPath = null;
+    	/* Register every tag of the vocabularies present at construction time */
+    	for(final Tagging voc : this.vocabularies.values()) {
+            for (final String t: voc.tags()) {
+                this.allTags.put(t, PRESENT);
+            }
+    	}
+    }

    public File getVocabularyFile(String name) {
        return new File(this.autotaggingPath, name + ".vocabulary");
@ -152,19 +175,90 @@ public class AutotaggingLibrary {
    	return 4;
    }

-    public Tagging.Metatag getTagFromTerm(Set<String> vocabularies, String term) {
+    /**
+     * Search a term in the given active vocabularies matching clear text words.
+     * @param vocabularies the vocabularies names to search for term
+     * @param term the word to search
+     * @return an instance of Metatag from the first matching vocabulary, or null when none was found
+     */
+    public Tagging.Metatag getTagFromTerm(final Set<String> vocabularies, String term) {
        if (this.vocabularies.isEmpty()) return null;
        Tagging.Metatag tag;
        term = Tagging.normalizeTerm(term);
        for (String vocabularyName: vocabularies) {
            Tagging t = this.vocabularies.get(vocabularyName);
-            if (t != null) {
+            if (t != null && !t.isMatchFromLinkedData()) {
                tag = t.getMetatagFromSynonym(term);
                if (tag != null) return tag;
            }
        }
        return null;
    }
+    
+	/**
+	 * Search in the active vocabularies matching linked data for Metatag entries with objectspace + term
+	 * matching the given term URL. Returns at most one Metatag instance per
+	 * vocabulary.
+	 * 
+	 * @param termURL
+	 *            the vocabulary term identifier (an absolute URL) to search
+	 * @return a set of matching Metatag instances, possibly empty
+	 */
+	public Set<Tagging.Metatag> getTagsFromTermURL(final DigestURL termURL) {
+		final Set<Tagging.Metatag> tags = new HashSet<>();
+		if (termURL == null || this.vocabularies.isEmpty()) {
+			return tags;
+		}
+		final String termURLStr = termURL.toNormalform(false);
+		String termNamespace = null;
+
+		/* If the term URL has a fragment, this should be the vocabulary term */
+		String term = termURL.getRef();
+		if (term == null) {
+			/*
+			 * No fragment in the URL : the term should then be the last segment of the URL
+			 */
+			term = termURL.getFileName();
+			if (StringUtils.isNotEmpty(term)) {
+				final int lastPathSeparatorPos = termURLStr.lastIndexOf("/");
+				if (lastPathSeparatorPos > 0) {
+					termNamespace = termURLStr.substring(0, lastPathSeparatorPos + 1);
+				}
+			}
+		} else {
+			final int fragmentPos = termURLStr.indexOf("#");
+			if (fragmentPos > 0) {
+				termNamespace = termURLStr.substring(0, fragmentPos + 1);
+			}
+		}
+		if (StringUtils.isNotEmpty(term) && termNamespace != null) {
+			final String alternativeTermNamespace;
+			/*
+			 * http://example.org/ and https://example.org/ are considered equivalent forms
+			 * for the namespace URL
+			 */
+			if (termURL.isHTTP()) {
+				alternativeTermNamespace = "https" + termNamespace.substring("http".length());
+			} else if (termURL.isHTTPS()) {
+				alternativeTermNamespace = "http" + termNamespace.substring("https".length());
+			} else {
+				alternativeTermNamespace = null;
+			}
+
+			for (final Tagging vocabulary : this.vocabularies.values()) {
+				if (vocabulary != null && vocabulary.isMatchFromLinkedData()) {
+					if ((termNamespace.equals(vocabulary.getObjectspace())) || (alternativeTermNamespace != null
+							&& alternativeTermNamespace.equals(vocabulary.getObjectspace()))) {
+						final Tagging.Metatag tag = vocabulary.getMetatagFromTerm(term);
+						if (tag != null) {
+							tags.add(tag);
+						}
+					}
+				}
+			}
+		}
+		return tags;
+	}

    public Tagging.Metatag metatag(String vocName, String term) {
        Tagging tagging = this.vocabularies.get(vocName);
--- a/source/net/yacy/cora/lod/vocabulary/Tagging.java
+++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java
@ -47,6 +47,9 @@ public class Tagging {

    public final static String DEFAULT_NAMESPACE= "http://yacy.net/autotagging#";
    public final static String DEFAULT_PREFIX = "tags";
+    
+    /** Default value for the property matchFromLinkedData */
+    public final static boolean DEFAULT_MATCH_FROM_LINKED_DATA = false;

    private final String navigatorName;
    private final Map<String, String> synonym2term;
@ -55,7 +58,16 @@ public class Tagging {
    private final Map<String, TaggingEntry> term2entries;
    
    private File propFile;
-    private boolean isFacet; // true if the vocabulary shall generate a navigation facet
+    
+    /** true if the vocabulary shall generate a navigation facet */
+    private boolean isFacet;
+    
+	/**
+	 * True when this vocabulary terms should only be matched from linked data types
+	 * annotations (with microdata, RDFa, microformats...) instead of clear text
+	 * words
+	 */
+	private boolean matchFromLinkedData;

    private String predicate, namespace, objectspace;

@ -101,6 +113,7 @@ public class Tagging {
        this.objectspace = null;
        this.propFile = null;
        this.isFacet = true;
+        this.matchFromLinkedData = DEFAULT_MATCH_FROM_LINKED_DATA;
    }

    public Tagging(String name, File propFile) throws IOException {
@ -285,6 +298,25 @@ public class Tagging {
        this.isFacet = isFacet;
    }
    
+	/**
+	 * @return true when this vocabulary terms should be matched from linked data
+	 *         types annotations (with microdata, RDFa, microformats...) instead of
+	 *         clear text words
+	 */
+    public boolean isMatchFromLinkedData() {
+		return this.matchFromLinkedData;
+	}
+    
+	/**
+	 * @param facetFromLinkedData
+	 *            true when this vocabulary terms should be matched from linked
+	 *            data types annotations (with microdata, RDFa, microformats...)
+	 *            instead of clear text words
+	 */
+    public void setMatchFromLinkedData(final boolean facetFromLinkedData) {
+    	this.matchFromLinkedData = facetFromLinkedData;
+    }
+    
    public int size() {
        return this.term2entries.size();
    }
@ -525,16 +557,41 @@ public class Tagging {
        return this.propFile;
    }

+	/**
+	 * @param word
+	 *            a synonym to look for
+	 * @return a Metatag instance with the matching term, or null when the synonym
+	 *         is not in this vocabulary.
+	 */
    public Metatag getMetatagFromSynonym(final String word) {
        String printname = this.synonym2term.get(word);
        if (printname == null) return null;
        return new Metatag(printname);
    }
+    
+	/**
+	 * @param term
+	 *            a term to look for
+	 * @return a Metatag instance with the matching term, or null when it is not in
+	 *         this vocabulary.
+	 */
+    public Metatag getMetatagFromTerm(final String term) {
+        TaggingEntry entry = this.term2entries.get(term);
+        if(entry == null) {
+        	return null;
+        }
+        return new Metatag(term);
+    }

-    public Metatag getMetatagFromTerm(final String word) {
+	/**
+	 * @param word
+	 *            the object of the Metatag
+	 * @return a new Metatag instance related to this vocabulary
+	 */
+    public Metatag buildMetatagFromTerm(final String word) {
        return new Metatag(word);
    }
-
+    
    public Set<String> tags() {
        return this.synonym2term.keySet();
    }
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -26,11 +26,13 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
+import java.util.Set;
 import java.util.SortedSet;

 import org.apache.solr.common.params.MapSolrParams;
@ -40,8 +42,12 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.solr.Ranking;
+import net.yacy.cora.language.synonyms.AutotaggingLibrary;
+import net.yacy.cora.lod.vocabulary.Tagging;
+import net.yacy.cora.lod.vocabulary.Tagging.Metatag;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.language.Identificator;
@ -190,6 +196,10 @@ public final class Condenser extends Tokenizer {
                }
            }
        }
+        
+        if(doAutotagging) {
+        	extractAutoTagsFromLinkedDataTypes(document.getLinkedDataTypes(), LibraryProvider.autotagging);
+        }

        // extend the tags in the document object with autotagging tags
        if (!this.tags.isEmpty()) {
@ -214,6 +224,36 @@ public final class Condenser extends Tokenizer {
        /* Restore the current thread initial name */
        Thread.currentThread().setName(initialThreadName);
    }
+    
+	/**
+	 * Search for tags matching the given linked data type identifiers (absolute
+	 * URLs) in the given autotagging library, then add any matching tags found to
+	 * this instance's "tags" map.
+	 * 
+	 * @param linkedDataTypes
+	 *            a set of linked data typed items identifiers (absolute URLs) to
+	 *            search
+	 * @param tagLibrary
+	 *            the autotagging library holding vocabularies to search in
+	 */
+	protected void extractAutoTagsFromLinkedDataTypes(final Set<DigestURL> linkedDataTypes,
+			final AutotaggingLibrary tagLibrary) {
+		if (linkedDataTypes == null || tagLibrary == null) {
+			return;
+		}
+		for (final DigestURL linkedDataType : linkedDataTypes) {
+			final Set<Metatag> tags = tagLibrary.getTagsFromTermURL(linkedDataType);
+			for (final Metatag tag : tags) {
+				final String navigatorName = tag.getVocabularyName();
+				Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
+				if (tagset == null) {
+					tagset = new HashSet<Metatag>();
+					this.tags.put(navigatorName, tagset);
+				}
+				tagset.add(tag);
+			}
+		}
+	}

    private void insertTextToWords(
            final SentenceReader text,
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -87,8 +87,16 @@ public class Document {
    // text in image tags.
    private LinkedHashMap<AnchorURL, String> audiolinks, videolinks, applinks, hyperlinks; // TODO: check if redundant value (set to key.getNameProperty()) is needed
    private LinkedHashMap<DigestURL, String> inboundlinks, outboundlinks;
+    
    /** links to icons that belongs to the document (mapped by absolute URL) */
    private Map<DigestURL, IconEntry> icons;
+    
+	/**
+	 * URLs of linked data item types/classes referenced by the document (for example in
+	 * HTML with standard annotations such as RDFa, microdata, microformats or
+	 * JSON-LD)
+	 */
+	private Set<DigestURL> linkedDataTypes;
    private boolean resorted;
    private final Set<String> languages;
    private boolean indexingDenied;
@ -145,6 +153,7 @@ public class Document {
        this.videolinks = null;
        this.applinks = null;
        this.icons = new HashMap<>();
+        this.linkedDataTypes = new HashSet<>();
        this.resorted = false;
        this.inboundlinks = null;
        this.outboundlinks = null;
@ -817,7 +826,7 @@ dc_rights
     * Set links to icons that belongs to the document (mapped by absolute URL)
     * @param icons
     */
-    public void setIcons(Map<DigestURL, IconEntry> icons) {
+    public void setIcons(final Map<DigestURL, IconEntry> icons) {
    	/* Better to ensure now icons property will not be null */
    	if(icons != null) {
    		this.icons = icons;	
@ -825,6 +834,28 @@ dc_rights
    		this.icons = new HashMap<>();
    	}
 	}
+    
+	/**
+	 * @return URLs of linked data item types/classes referenced by the document (for example in
+	 * HTML with standard annotations such as RDFa, microdata, microformats or
+	 * JSON-LD)
+	 */
+    public Set<DigestURL> getLinkedDataTypes() {
+		return this.linkedDataTypes;
+	}
+    
+	/**
+	 * Set the URLs of linked data item types/classes referenced by the document.
+	 * 
+	 * @param linkedDataTypes
+	 *            the URLs of linked data item types/classes referenced by the
+	 *            document. When null, the property is reset to a new empty set.
+	 */
+    public void setLinkedDataTypes(final Set<DigestURL> linkedDataTypes) {
+    	if(linkedDataTypes != null) {
+    		this.linkedDataTypes = linkedDataTypes;
+    	} else {
+    		/*
+    		 * Ensure non null property. A fresh set is assigned rather than clearing the
+    		 * current one : the current set may have been supplied by a previous caller,
+    		 * so clearing it would modify that caller's data, or fail when the set is
+    		 * unmodifiable.
+    		 */
+    		this.linkedDataTypes = new HashSet<>();
+    	}
+    }
+    

    public int inboundLinkNofollowCount() {
        if (this.inboundlinks == null) resortLinks();
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@ -214,6 +214,7 @@ public class htmlParser extends AbstractParser implements Parser {
                scraper.getDate());
        ppd.setScraperObject(scraper);
        ppd.setIcons(scraper.getIcons());
+        ppd.setLinkedDataTypes(scraper.getLinkedDataTypes());
        ppd.setPartiallyParsed(scraper.isLimitsExceeded());
        
        return ppd;
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -419,12 +419,28 @@ public final class Switchboard extends serverSwitch {
            public void run() {
                Thread.currentThread().setName("LibraryProvider.initialize");
                LibraryProvider.initialize(Switchboard.this.dictionariesPath);
-                // persistent Vocabulary Switch
-                Set<String> omit = Switchboard.this.getConfigSet("search.result.show.vocabulary.omit");
-                for (String o: omit) {
-                    Tagging t = LibraryProvider.autotagging.getVocabulary(o);
-                    if (t != null) t.setFacet(false);
+                // persistent Vocabulary Switches
+                final Set<String> omit = Switchboard.this.getConfigSet("search.result.show.vocabulary.omit");
+                for (final String o: omit) {
+                    final Tagging t = LibraryProvider.autotagging.getVocabulary(o);
+                    if (t != null) {
+                    	t.setFacet(false);
+                    } else {
+                    	log.config("search.result.show.vocabulary.omit configuration value contains an unknown vocabulary name : " + o);
+                    }
                }
+                
+				final Set<String> linkedDataVocs = Switchboard.this
+						.getConfigSet(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES);
+				for (final String vocName : linkedDataVocs) {
+					final Tagging t = LibraryProvider.autotagging.getVocabulary(vocName);
+					if (t != null) {
+						t.setMatchFromLinkedData(true);
+					} else {
+						log.config(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES
+								+ " configuration value contains an unknown vocabulary name : " + vocName);
+					}
+				}

                Thread.currentThread().setName("ProbabilisticClassification.initialize");
                ProbabilisticClassifier.initialize(Switchboard.this.classificationPath);
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@ -411,6 +411,11 @@ public final class SwitchboardConstants {
    public static final String DICTIONARY_SOURCE_PATH         = "dictionaries";
    public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES";
    
+    /** Setting key for a set of comma separated vocabulary names whose terms should only be matched 
+    * from linked data types annotations in documents (with microdata, RDFa, microformats...) 
+    * instead of cleartext words */
+    public static final String VOCABULARIES_MATCH_LINKED_DATA_NAMES = "vocabularies.matchLinkedData.names";
+    
    public static final String CLASSIFICATION_SOURCE_PATH         = "classification";
    public static final String CLASSIFICATION_SOURCE_PATH_DEFAULT = "DATA/CLASSIFICATION";

--- a/test/java/net/yacy/cora/language/synonyms/AutotaggingLibraryTest.java
+++ b/test/java/net/yacy/cora/language/synonyms/AutotaggingLibraryTest.java
@ -0,0 +1,118 @@
+// AutotaggingLibraryTest.java
+// Copyright 2018 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.cora.language.synonyms;
+
+import java.io.IOException;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.lod.vocabulary.Tagging;
+import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
+
+/**
+ * Unit tests for the {@link AutotaggingLibrary} class.
+ * @author luccioman
+ *
+ */
+public class AutotaggingLibraryTest {
+
+	/**
+	 * Test tags search from term URL
+	 * @throws IOException when an unexpected error occurred
+	 */
+	@Test
+	public void testGetTagsFromTermURL() throws IOException {
+		final ConcurrentHashMap<String, Tagging> vocabularies = new ConcurrentHashMap<String, Tagging>();
+		final AutotaggingLibrary lib = new AutotaggingLibrary(vocabularies);
+		
+		Map<String, SOTuple> table = new LinkedHashMap<String, Tagging.SOTuple>();
+		/* Sample types extracted from https://www.w3.org/TR/activitystreams-vocabulary/#activity-types */
+		table.put("Accept", new Tagging.SOTuple(Tagging.normalizeTerm("Accept"), null));
+		table.put("Add", new Tagging.SOTuple(Tagging.normalizeTerm("Add"), null));
+		Tagging voc = new Tagging("activitystream", null, "https://www.w3.org/ns/activitystreams#", table);
+		voc.setMatchFromLinkedData(true);
+		vocabularies.put("activitystream", voc);
+		
+		table = new LinkedHashMap<String, Tagging.SOTuple>();
+		/* Sample classes extracted from http://dublincore.org/documents/dcmi-terms/#H2 */
+		table.put("Agent", new Tagging.SOTuple(Tagging.normalizeTerm("Agent"), null));
+		table.put("MediaType", new Tagging.SOTuple(Tagging.normalizeTerm("MediaType"), null));
+		voc = new Tagging("DublinCore", null, "http://purl.org/dc/terms/", table);
+		voc.setMatchFromLinkedData(true);
+		vocabularies.put("DublinCore", voc);
+		
+		table = new LinkedHashMap<String, Tagging.SOTuple>();
+		/* Sample types extracted from http://schema.org/docs/full.html */
+		table.put("Article", new Tagging.SOTuple(Tagging.normalizeTerm("Article"), null));
+		table.put("Blog", new Tagging.SOTuple(Tagging.normalizeTerm("Blog"), null));
+		voc = new Tagging("Schema.org", null, "http://schema.org/", table);
+		voc.setMatchFromLinkedData(true);
+		vocabularies.put("Schema.org", voc);
+		
+		
+		/* Term URL with fragment, path and file parts */
+		Set<Tagging.Metatag> tags = lib.getTagsFromTermURL(new DigestURL("https://www.w3.org/ns/activitystreams#Accept"));
+		Assert.assertEquals(1, tags.size());
+		Tagging.Metatag tag = tags.iterator().next();
+		Assert.assertEquals("activitystream", tag.getVocabularyName());
+		Assert.assertEquals("Accept", tag.getObject());
+		
+		/* Alternate accepted term URL form with http protocol */
+		tags = lib.getTagsFromTermURL(new DigestURL("http://www.w3.org/ns/activitystreams#Accept"));
+		Assert.assertEquals(1, tags.size());
+		tag = tags.iterator().next();
+		Assert.assertEquals("activitystream", tag.getVocabularyName());
+		Assert.assertEquals("Accept", tag.getObject());
+		
+		/* Term URL with path and file parts */
+		tags = lib.getTagsFromTermURL(new DigestURL("http://purl.org/dc/terms/MediaType"));
+		Assert.assertEquals(1, tags.size());
+		tag = tags.iterator().next();
+		Assert.assertEquals("DublinCore", tag.getVocabularyName());
+		Assert.assertEquals("MediaType", tag.getObject());
+		
+		/* Term URL with file part only */
+		tags = lib.getTagsFromTermURL(new DigestURL("http://schema.org/Article"));
+		Assert.assertEquals(1, tags.size());
+		tag = tags.iterator().next();
+		Assert.assertEquals("Schema.org", tag.getVocabularyName());
+		Assert.assertEquals("Article", tag.getObject());
+		
+		/* Missing terms */
+		tags = lib.getTagsFromTermURL(new DigestURL("https://www.w3.org/ns/activitystreams#MissingTerm"));
+		Assert.assertEquals(0, tags.size());
+		
+		tags = lib.getTagsFromTermURL(new DigestURL("https://www.w3.org/ns/activitystreams#Accepting"));
+		Assert.assertEquals(0, tags.size());
+		
+		/* Wrong namespace */
+		tags = lib.getTagsFromTermURL(new DigestURL("https://example.org/namespace#Accept"));
+		Assert.assertEquals(0, tags.size());
+	}
+
+}