Added HTML microdata typed items parsing capability.

This lets the HTML parser collect the type URLs of items annotated on HTML tags with the itemscope and itemtype attributes (see the microdata specification, https://www.w3.org/TR/microdata/ ), notably types from the schema.org vocabulary, but also types/classes from any other vocabulary, such as the common ones listed in the RDFa core context ( https://www.w3.org/2011/rdfa-context/rdfa-1.1.html ).
7 years ago · 58b9834729
parent 80fb1026d0
commit 58b9834729
4 changed files with 191 additions and 4 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -191,6 +191,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    private final SizeLimitedMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
    private final List<ImageEntry> images; 
    private final SizeLimitedSet<AnchorURL> script, frames, iframes;
+    
+	/**
+	 * URLs of linked data item types referenced from HTML content with standard
+	 * annotations such as RDFa, microdata, microformats or JSON-LD
+	 */
+    private final SizeLimitedSet<DigestURL> linkedDataTypes;
+    
    private final SizeLimitedMap<String, String> metas;
    private final SizeLimitedMap<String, DigestURL> hreflang, navigation;
    private LinkedHashSet<String> titles;
@ -260,6 +267,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.embeds = new SizeLimitedMap<AnchorURL, EmbedEntry>(maxLinks);
        this.frames = new SizeLimitedSet<AnchorURL>(maxLinks);
        this.iframes = new SizeLimitedSet<AnchorURL>(maxLinks);
+        this.linkedDataTypes = new SizeLimitedSet<>(maxLinks);
        this.metas = new SizeLimitedMap<String, String>(maxLinks);
        this.hreflang = new SizeLimitedMap<String, DigestURL>(maxLinks);
        this.navigation = new SizeLimitedMap<String, DigestURL>(maxLinks);
@ -543,12 +551,49 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        }
    }
    
-    private void checkOpts(Tag tag) {
+	/**
+	 * Parse the microdata itemtype attribute of a tag, if any, and extract its
+	 * valid URL tokens. Nothing is extracted unless itemscope is also present.
+	 * 
+	 * @param tagAttributes parsed HTML tag attributes, may be null
+	 * @return a new set of URLs, empty when tagAttributes is null, when itemscope
+	 *         or itemtype is missing, or when no itemtype token is a valid URL
+	 * @see <a href="https://www.w3.org/TR/microdata/#dfn-itemtype">itemtype
+	 *      definition at W3C</a>
+	 * @see <a href=
+	 *      "https://html.spec.whatwg.org/multipage/microdata.html#attr-itemtype">itemtype
+	 *      definition at WHATWG</a>
+	 */
+	private Set<DigestURL> parseMicrodataItemType(final Properties tagAttributes) {
+		final Set<DigestURL> types = new HashSet<>();
+		if (tagAttributes != null) {
+			/*
+			 * The itemtype attribute must not be specified on elements that do not have an
+			 * itemscope attribute specified. So only the presence of the itemscope boolean
+			 * attribute is checked here, leniently: a strictly conforming parser would also
+			 * check that it has no value, or that its value is empty or "itemscope".
+			 */
+			if (tagAttributes.getProperty("itemscope") != null) {
+				final Set<String> itemTypes = parseSpaceSeparatedTokens(tagAttributes.getProperty("itemtype"));
+
+				for (final String itemType : itemTypes) {
+					try {
+						types.add(new DigestURL(itemType));
+					} catch (final MalformedURLException ignored) {
+						/* Skip this token: each itemtype space-separated token must be a valid absolute URL */
+					}
+				}
+			}
+		}
+		return types;
+	}
+    
+    private void checkOpts(final Tag tag) {
        // vocabulary classes
        final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
        this.vocabularyScraper.check(this.root, classprop, tag.content);
        
-        // itemprop (schema.org)
+        // itemprop microdata property (standard definition at https://www.w3.org/TR/microdata/#dfn-attr-itemprop)
        String itemprop = tag.opts.getProperty("itemprop");
        if (itemprop != null) {
            String propval = tag.opts.getProperty("content"); // value for <meta itemprop="" content=""> see https://html.spec.whatwg.org/multipage/microdata.html#values
@ -620,7 +665,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 	 *            attribute string, may be null
 	 * @return a set of tokens eventually empty
 	 */
-	public static Set<String> parseSpaceSeparatedTokens(String attr) {
+	public static Set<String> parseSpaceSeparatedTokens(final String attr) {
 		Set<String> tokens = new HashSet<>();
 		/* Check attr string is not empty to avoid adding a single empty string
 		 * in result */
@ -923,6 +968,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
    }
    
+	/**
+	 * Scraping operation applied to the opening of any kind of tag, singleton or
+	 * paired, and not restricted to the tags listed in
+	 * {@link ContentScraper#linkTags0} and {@link ContentScraper#linkTags1}.
+	 */
+	@Override
+	public void scrapeAnyTagOpening(final String tagName, final Properties tagAttributes) {
+		if (tagAttributes != null) {
+			/*
+			 * HTML microdata can be annotated on any kind of tag, so this scraping is not
+			 * restricted to the limited sets in linkTags0 and linkTags1 (tagName is unused)
+			 */
+			this.linkedDataTypes.addAll(parseMicrodataItemType(tagAttributes));
+		}
+	}
+    
    /**
     * Add an anchor to the anchors list, and trigger any eventual listener
     * @param anchor anchor to add. Must not be null.
@ -1092,6 +1153,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        // returns a url (String) / name (String) relation
        return this.iframes;
    }
+    
+	/**
+	 * @return the internal set itself (not a copy) of URLs of linked data item types referenced
+	 *         from HTML content with standard annotations such as RDFa, microdata, microformats or JSON-LD
+	 */
+	public SizeLimitedSet<DigestURL> getLinkedDataTypes() {
+		return this.linkedDataTypes;
+	}

    public Set<AnchorURL> getScript() {
        return this.script;
@ -1164,7 +1233,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 		return this.contentSizeLimitExceeded || this.maxAnchorsExceeded || this.css.isLimitExceeded()
 				|| this.rss.isLimitExceeded() || this.embeds.isLimitExceeded() || this.metas.isLimitExceeded()
 				|| this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded()
-				|| this.frames.isLimitExceeded() || this.iframes.isLimitExceeded();
+				|| this.frames.isLimitExceeded() || this.iframes.isLimitExceeded() || this.linkedDataTypes.isLimitExceeded();
 	}
    
    /*
@ -1384,6 +1453,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.script.clear();
        this.frames.clear();
        this.iframes.clear();
+        this.linkedDataTypes.clear();
        this.embeds.clear();
        this.images.clear();
        this.icons.clear();
--- a/source/net/yacy/document/parser/html/Scraper.java
+++ b/source/net/yacy/document/parser/html/Scraper.java
@ -24,17 +24,52 @@

 package net.yacy.document.parser.html;

+import java.util.Properties;
+
 public interface Scraper {

+	/**
+	 * @param tag
+	 *            a tag name
+	 * @return true when the tag name belongs to the first category of tags
+	 *         according to the Scraper implementation, and is therefore candidate
+	 *         for processing by
+	 *         {@link #scrapeTag0(net.yacy.document.parser.html.ContentScraper.Tag)}
+	 *         implementation
+	 */
    public boolean isTag0(String tag);

+	/**
+	 * @param tag
+	 *            a tag name
+	 * @return true when the tag name belongs to the second category of tags
+	 *         according to the Scraper implementation, and is therefore candidate
+	 *         for processing by
+	 *         {@link #scrapeTag1(net.yacy.document.parser.html.ContentScraper.Tag)}
+	 *         implementation
+	 */
    public boolean isTag1(String tag);

    public void scrapeText(char[] text, String insideTag);

+    /**
+     * Process a tag belonging to the first category of tags according to the Scraper implementation
+     * @param tag a parsed tag
+     */
    public void scrapeTag0(ContentScraper.Tag tag);

+    /**
+     * Process a tag belonging to the second category of tags according to the Scraper implementation
+     * @param tag a parsed tag
+     */
    public void scrapeTag1(ContentScraper.Tag tag);
+    
+    /**
+     * Processing applied to any kind of tag opening.
+     * @param tagName the tag name
+     * @param tagAttributes the attributes of the tag
+     */
+    public void scrapeAnyTagOpening(String tagName, Properties tagAttributes);

    public void scrapeComment(final char[] comment);

--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@ -292,6 +292,10 @@ public final class TransformerWriter extends Writer {
        final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
        ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
        charBuffer.close();
+        /* Apply processing relevant for any kind of tag opening; the scraper is optional, hence the same null guard as the check below */
+        if (this.scraper != null) {
+            this.scraper.scrapeAnyTagOpening(tag.name, tag.opts);
+        }
        if (this.scraper != null && this.scraper.isTag0(tagname)) {
            // this single tag is collected at once here
            this.scraper.scrapeTag0(tag);
--- a/test/java/net/yacy/document/parser/html/ContentScraperTest.java
+++ b/test/java/net/yacy/document/parser/html/ContentScraperTest.java
@ -29,8 +29,11 @@ import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collection;
 import java.util.Date;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;

 import org.junit.Assert;
@ -338,5 +341,80 @@ public class ContentScraperTest {
    	Assert.assertEquals("{abc}{def}", ContentScraper.removeUnpairedBrackets("{abc}{def}", '{', '}'));
    	Assert.assertEquals("{{abc}{def}}", ContentScraper.removeUnpairedBrackets("{{abc}{def}}", '{', '}'));
    }
+    
+    /**
+     * Test microdata itemtype attribute parsing on HTML samples streamed through a TransformerWriter
+     * @throws IOException when processing a sample unexpectedly fails (e.g. a malformed test URL)
+     */
+    @Test
+    public void testParseMicroDataItemType() throws IOException {
+    	final String htmlHeader = "<!DOCTYPE html><head><title>Test document</title></head>";
+        final DigestURL docUrl = new DigestURL("http://example.org/microdata.html");
+        
+        /* Maps each HTML sample to the item type URLs expected to be scraped from it */
+        final Map<String, String[]> html2Results = new HashMap<>();
+        /* Basic microdata syntax example with no item type */
+    	String html = htmlHeader + "<div itemscope><p>My name is <span itemprop=\"name\">Elizabeth</span>.</p></div>";
+    	String[] expectedUrls = {};
+    	html2Results.put(html, expectedUrls);
+    	
+    	/* Nested items with no item type */
+    	html = "<div itemscope>\n" + 
+    	" <p>Name: <span itemprop=\"name\">Amanda</span></p>\n" + 
+    	" <p>Band: <span itemprop=\"band\" itemscope> <span itemprop=\"name\">Jazz Band</span> (<span itemprop=\"size\">12</span> players)</span></p>\n" + 
+    	"</div>";
+    	expectedUrls = new String[0];
+    	html2Results.put(html, expectedUrls);
+    	
+    	/* One typed item */
+    	html = htmlHeader + "<div itemscope itemtype=\"https://schema.org/LocalBusiness\"><img itemprop=\"logo\" src=\"our-logo.png\" alt=\"Our Company\"></div>";
+    	expectedUrls = new String[]{"https://schema.org/LocalBusiness"};
+    	html2Results.put(html, expectedUrls);
+    	
+    	/* more than one type per item */
+    	html = htmlHeader + "<dl itemscope itemtype=\"https://md.example.com/loco https://md.example.com/lighting\">" + 
+    	" <dt>Name:\n" + 
+    	" <dd itemprop=\"name\">Tank Locomotive (DB 80)\n" + 
+    	" <dt>Product code:\n" + 
+    	" <dd itemprop=\"product-code\">33041\n" + 
+    	" <dt>Scale:\n" + 
+    	" <dd itemprop=\"scale\">HO\n" + 
+    	" <dt>Digital:\n" + 
+    	" <dd itemprop=\"digital\">Delta\n" + 
+    	"</dl>";
+    	expectedUrls = new String[]{"https://md.example.com/loco", "https://md.example.com/lighting"};
+    	html2Results.put(html, expectedUrls);
+    	
+    	/* Nested typed items */
+    	html = htmlHeader + "<div itemscope itemtype=\"http://schema.org/Product\">\n" + 
+    	" <span itemprop=\"name\">Panasonic White 60L Refrigerator</span>\n" + 
+    	" <img src=\"panasonic-fridge-60l-white.jpg\" alt=\"\">\n" + 
+    	"  <div itemprop=\"aggregateRating\"\n" + 
+    	"       itemscope itemtype=\"http://schema.org/AggregateRating\">\n" + 
+    	"   <meter itemprop=\"ratingValue\" min=0 value=3.5 max=5>Rated 3.5/5</meter>\n" + 
+    	"   (based on <span itemprop=\"reviewCount\">11</span> customer reviews)\n" + 
+    	"  </div>\n" + 
+    	"</div>";
+    	expectedUrls = new String[]{"http://schema.org/Product", "http://schema.org/AggregateRating"};
+    	html2Results.put(html, expectedUrls);
+  
+		/* Scrape each sample with a fresh ContentScraper and compare the collected types with the expected ones */
+		for (final Entry<String, String[]> html2Result : html2Results.entrySet()) {
+			ContentScraper scraper = new ContentScraper(docUrl, 10, new HashSet<String>(), new VocabularyScraper(), 0);
+			try (final Writer writer = new TransformerWriter(null, null, scraper, null, false)) {
+				FileUtils.copy(new StringReader(html2Result.getKey()), writer);
+
+				final Set<DigestURL> expected = new HashSet<>();
+				for (final String url : html2Result.getValue()) {
+					expected.add(new DigestURL(url));
+				}
+
+				Assert.assertEquals(expected.size(), scraper.getLinkedDataTypes().size());
+				Assert.assertTrue(expected.containsAll(scraper.getLinkedDataTypes()));
+			} finally {
+				scraper.close();
+			}
+		}
+    }

 }