From b522d540b9492de5b16293bee309615619584eef Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 25 Dec 2016 23:39:55 +0100 Subject: [PATCH] Include itemprop latitude/longitude (see schema.org) in attribute parsing for lat/lon. Harmonize number parsing for lat/lon to parseDouble. Fix endDates_dts value assignment. --- .../document/parser/html/ContentScraper.java | 53 ++++++++++++------- .../schema/CollectionConfiguration.java | 3 +- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 4dc27ca2e..15f3bace2 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -300,29 +300,29 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (newtext[r] == ' ') { r--; if (newtext[r] == 'N') { - this.lat = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) + - Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d; + this.lat = Double.parseDouble(new String(newtext, r + 2, p - r - 2)) + + Double.parseDouble(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d; if (this.lon != 0.0d) break location; s = q + 6; continue location; } if (newtext[r] == 'S') { - this.lat = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) - - Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d; + this.lat = -Double.parseDouble(new String(newtext, r + 2, p - r - 2)) - + Double.parseDouble(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d; if (this.lon != 0.0d) break location; s = q + 6; continue location; } if (newtext[r] == 'E') { - this.lon = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) + - Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d; + this.lon = Double.parseDouble(new String(newtext, r + 2, p - r - 2)) + + Double.parseDouble(new String(newtext, p + pl + 1, q - p - pl
- 1)) / 60.0d; if (this.lat != 0.0d) break location; s = q + 6; continue location; } if (newtext[r] == 'W') { - this.lon = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) - - Float.parseFloat(new String(newtext, p + 2, q - p - pl - 1)) / 60.0d; + this.lon = -Double.parseDouble(new String(newtext, r + 2, p - r - 2)) - + Double.parseDouble(new String(newtext, p + 2, q - p - pl - 1)) / 60.0d; if (this.lat != 0.0d) break location; s = q + 6; continue location; @@ -399,19 +399,34 @@ public class ContentScraper extends AbstractScraper implements Scraper { // itemprop (schema.org) String itemprop = tag.opts.getProperty("itemprop"); if (itemprop != null) { - String propval = tag.opts.getProperty("content"); + String propval = tag.opts.getProperty("content"); // value for see https://html.spec.whatwg.org/multipage/microdata.html#values if (propval == null) propval = tag.opts.getProperty("datetime"); // html5 + schema.org#itemprop example: while each prop is optional if (propval != null) { // html5 example: while each prop is optional - if ("startDate".equals(itemprop)) try { - // parse ISO 8601 date - Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); - this.startDates.add(startDate); - } catch (ParseException e) {} - if ("endDate".equals(itemprop)) try { - // parse ISO 8601 date - Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); - this.endDates.add(endDate); - } catch (ParseException e) {} + // check (schema.org) + switch (itemprop) { + // itemprops of main element with microdata
+ case "latitude": // + this.lat = Double.parseDouble(propval); // TODO: possibly overwrite existing value (multiple coordinates in document) + break; // TODO: risk to mix up existing coordinate if longitude not given too + case "longitude": // + this.lon = Double.parseDouble(propval); // TODO: possibly overwrite existing value (multiple coordinates in document) + break; // TODO: risk to mix up existing coordinate if latitude not given too + + case "startDate": // + try { + // parse ISO 8601 date + Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); + this.startDates.add(startDate); + } catch (ParseException e) {} + break; + case "endDate": + try { + // parse ISO 8601 date + Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); + this.endDates.add(endDate); + } catch (ParseException e) {} + break; + } } } } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 6e26ff390..556275392 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -87,7 +87,6 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.ProbabilisticClassifier; @@ -637,7 +636,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final List startDates = html.getStartDates(); if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()])); - final List endDates = html.getStartDates(); + final List endDates = html.getEndDates(); if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new 
Date[endDates.size()])); final List articles = html.getArticles();