include html5 <time> tag in content scraper,

Add the "datetime" attribute of the <time> tag to the scraper's start date list. The datetime value is parsed as an ISO 8601 (XML) date; HTML5 also allows partial dates and durations, which are not handled by this change.
8 years ago · cb95b7339a
parent f153cc4b5d
commit cb95b7339a
2 changed files with 47 additions and 1 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -120,7 +120,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        script(TagType.pair),
        span(TagType.pair),
        div(TagType.pair),
-        article(TagType.pair),
+        article(TagType.pair), // html5
+        time(TagType.pair), // html5 <time datetime>
        // tags used to capture tag content
        // TODO: consider using </head> or <body> as trigger to scrape for text content
        style(TagType.pair); // embedded css (if not declared as tag content is parsed as text)
@ -724,6 +725,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        } else if (tag.name.equalsIgnoreCase("article")) {
            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
            if (h.length() > 0) this.articles.add(h);
+        } else if (tag.name.equalsIgnoreCase(TagName.time.name())) { // html5 tag <time datetime="2016-12-23">Event</time>
+            h = tag.opts.getProperty("datetime");
+            if (h != null) {
+                try {
+                    Date startDate = ISO8601Formatter.FORMATTER.parse(h, this.timezoneOffset).getTime();
+                    this.startDates.add(startDate);
+                } catch (ParseException ex) { }
+            }
        }

        // fire event
--- a/test/java/net/yacy/document/parser/html/ContentScraperTest.java
+++ b/test/java/net/yacy/document/parser/html/ContentScraperTest.java
@ -21,7 +21,17 @@
 package net.yacy.document.parser.html;

 import java.awt.Dimension;
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.Writer;
+import java.net.MalformedURLException;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
 import java.util.Set;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.VocabularyScraper;
+import net.yacy.kelondro.util.FileUtils;

 import org.junit.Assert;
 import org.junit.Test;
@ -122,4 +132,31 @@ public class ContentScraperTest {
 		Assert.assertEquals(0, tokens.size());
 	}

+    /**
+     * Verifies that the {@code datetime} attribute of an html5 {@code <time>} tag is
+     * collected into the scraper's start date list.
+     *
+     * The scraper is created with a timezone offset of 0, so the expected date is
+     * built explicitly in UTC; building it in the JVM default time zone would make
+     * the outcome depend on the machine the test runs on.
+     *
+     * @throws MalformedURLException if the fixed test URL cannot be parsed
+     * @throws IOException if feeding the page to the scraper fails
+     */
+    @Test
+    public void testGetStartDates() throws MalformedURLException, IOException {
+        DigestURL root = new DigestURL("http://test.org/test.html");
+
+        String page = "<html><body>"
+                + "<time datetime='2016-12-23'>23. Dezember 2016</time>" // html5 time tag
+                + "</body></html>";
+
+        ContentScraper scraper = new ContentScraper(root, 10, new VocabularyScraper(), 0);
+        try {
+            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
+            try {
+                FileUtils.copy(new StringReader(page), writer);
+            } finally {
+                // the writer is closed before the scraper results are read, as in the original flow
+                writer.close();
+            }
+
+            List<Date> dateResultList = scraper.getStartDates();
+
+            // expected value: 2016-12-23 00:00:00.000 UTC, independent of the default time zone
+            Calendar cal = Calendar.getInstance(java.util.TimeZone.getTimeZone("UTC"));
+            cal.clear(); // reset all fields, including time of day and milliseconds
+            cal.set(2016, Calendar.DECEMBER, 23);
+
+            // without this check the loop below would pass vacuously on an empty list
+            Assert.assertFalse("no start date scraped from <time datetime>", dateResultList.isEmpty());
+            for (Date d : dateResultList) {
+                Assert.assertEquals(cal.getTime(), d);
+            }
+        } finally {
+            scraper.close();
+        }
+    }
+
 }