include html5 <time> tag in content scraper,

Add the "datetime" property of the <time> tag to the scraper's start-date list.
The datetime value is parsed as an ISO 8601 (XML) date; HTML5 also allows partial
dates as well as durations (not handled by this).
pull/98/head
reger 8 years ago
parent f153cc4b5d
commit cb95b7339a

@ -120,7 +120,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
script(TagType.pair),
span(TagType.pair),
div(TagType.pair),
article(TagType.pair),
article(TagType.pair), // html5
time(TagType.pair), // html5 <time datetime>
// tags used to capture tag content
// TODO: consider using </head> or <body> as trigger to scrape for text content
style(TagType.pair); // embedded css (if not declared as tag content is parsed as text)
@ -724,6 +725,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (tag.name.equalsIgnoreCase("article")) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.articles.add(h);
} else if (tag.name.equalsIgnoreCase(TagName.time.name())) { // html5 tag <time datetime="2016-12-23">Event</time>
h = tag.opts.getProperty("datetime");
if (h != null) {
try {
Date startDate = ISO8601Formatter.FORMATTER.parse(h, this.timezoneOffset).getTime();
this.startDates.add(startDate);
} catch (ParseException ex) { }
}
}
// fire event

@ -21,7 +21,17 @@
package net.yacy.document.parser.html;
import java.awt.Dimension;
import java.io.IOException;
import java.io.StringReader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import org.junit.Assert;
import org.junit.Test;
@ -122,4 +132,31 @@ public class ContentScraperTest {
Assert.assertEquals(0, tokens.size());
}
/**
 * Checks that the {@code datetime} attribute of an html5 {@code <time>} tag
 * is reported by {@code ContentScraper.getStartDates()}.
 *
 * The page is pushed through a {@code TransformerWriter} into the scraper;
 * every date the scraper returns must equal 2016-12-23, and at least one
 * date must be returned.
 *
 * @throws MalformedURLException if the test URL cannot be parsed
 * @throws IOException if copying the page into the writer fails
 */
@Test
public void testGetStartDates() throws MalformedURLException, IOException {
    final DigestURL root = new DigestURL("http://test.org/test.html");
    final String page = "<html><body>"
            + "<time datetime='2016-12-23'>23. Dezember 2016</time>" // html5 time tag
            + "</body></html>";

    final ContentScraper scraper = new ContentScraper(root, 10, new VocabularyScraper(), 0);
    try {
        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
        FileUtils.copy(new StringReader(page), writer);
        writer.close();

        final List<Date> dateResultList = scraper.getStartDates();

        // Without this check the loop below would pass vacuously when the
        // scraper did not pick up the <time> tag at all.
        Assert.assertFalse("no start date extracted from <time datetime>", dateResultList.isEmpty());

        // Expected value: midnight of 2016-12-23 in UTC. The scraper is created
        // with a timezone offset of 0 (last constructor argument above).
        // NOTE(review): assumes the ISO8601 parser yields UTC midnight for a
        // date-only value with offset 0 - confirm against ISO8601Formatter.
        // An explicit UTC calendar avoids depending on the JVM default zone,
        // whose offset at the 1970 epoch may differ from its offset in 2016.
        final Calendar cal = Calendar.getInstance(java.util.TimeZone.getTimeZone("UTC"));
        cal.clear(); // zero all fields, including hours, minutes, seconds and millis
        cal.set(2016, Calendar.DECEMBER, 23);
        final Date expected = cal.getTime();

        for (final Date d : dateResultList) {
            Assert.assertEquals(expected, d);
        }
    } finally {
        // release the scraper even when an assertion or the copy fails
        scraper.close();
    }
}
}

Loading…
Cancel
Save