Include itemprop latitude/longitude (see schema.org) in attribute parsing for lat/lon.
Harmonize number parsing for lat/lon to parseDouble.
Fix endDates_dts value assignment.
pull/97/merge
reger 8 years ago
parent 083df255e4
commit b522d540b9

@ -300,29 +300,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (newtext[r] == ' ') { if (newtext[r] == ' ') {
r--; r--;
if (newtext[r] == 'N') { if (newtext[r] == 'N') {
this.lat = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) + this.lat = Double.parseDouble(new String(newtext, r + 2, p - r - 2)) +
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d; Double.parseDouble(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d;
if (this.lon != 0.0d) break location; if (this.lon != 0.0d) break location;
s = q + 6; s = q + 6;
continue location; continue location;
} }
if (newtext[r] == 'S') { if (newtext[r] == 'S') {
this.lat = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) - this.lat = -Double.parseDouble(new String(newtext, r + 2, p - r - 2)) -
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d; Double.parseDouble(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d;
if (this.lon != 0.0d) break location; if (this.lon != 0.0d) break location;
s = q + 6; s = q + 6;
continue location; continue location;
} }
if (newtext[r] == 'E') { if (newtext[r] == 'E') {
this.lon = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) + this.lon = Double.parseDouble(new String(newtext, r + 2, p - r - 2)) +
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d; Double.parseDouble(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0d;
if (this.lat != 0.0d) break location; if (this.lat != 0.0d) break location;
s = q + 6; s = q + 6;
continue location; continue location;
} }
if (newtext[r] == 'W') { if (newtext[r] == 'W') {
this.lon = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) - this.lon = -Double.parseDouble(new String(newtext, r + 2, p - r - 2)) -
Float.parseFloat(new String(newtext, p + 2, q - p - pl - 1)) / 60.0d; Double.parseDouble(new String(newtext, p + 2, q - p - pl - 1)) / 60.0d;
if (this.lat != 0.0d) break location; if (this.lat != 0.0d) break location;
s = q + 6; s = q + 6;
continue location; continue location;
@ -399,19 +399,34 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// itemprop (schema.org) // itemprop (schema.org)
String itemprop = tag.opts.getProperty("itemprop"); String itemprop = tag.opts.getProperty("itemprop");
if (itemprop != null) { if (itemprop != null) {
String propval = tag.opts.getProperty("content"); String propval = tag.opts.getProperty("content"); // value for <meta itemprop="" content=""> see https://html.spec.whatwg.org/multipage/microdata.html#values
if (propval == null) propval = tag.opts.getProperty("datetime"); // html5 + schema.org#itemprop example: <time itemprop="startDate" datetime="2016-01-26">today</time> while each prop is optional if (propval == null) propval = tag.opts.getProperty("datetime"); // html5 + schema.org#itemprop example: <time itemprop="startDate" datetime="2016-01-26">today</time> while each prop is optional
if (propval != null) { // html5 example: <time datetime="2016-01-26">today</time> while each prop is optional if (propval != null) { // html5 example: <time datetime="2016-01-26">today</time> while each prop is optional
if ("startDate".equals(itemprop)) try { // check <itemprop with value="" > (schema.org)
// parse ISO 8601 date switch (itemprop) {
Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); // <meta> itemprops of main element with microdata <div itemprop="geo" itemscope itemtype="http://schema.org/GeoCoordinates">
this.startDates.add(startDate); case "latitude": // <meta itemprop="latitude" content="47.2649990" />
} catch (ParseException e) {} this.lat = Double.parseDouble(propval); // TODO: possibly overwrite existing value (multiple coordinates in document)
if ("endDate".equals(itemprop)) try { break; // TODO: risk to mix up existing coordinate if longitude not given too
// parse ISO 8601 date case "longitude": // <meta itemprop="longitude" content="11.3428720" />
Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); this.lon = Double.parseDouble(propval); // TODO: possibly overwrite existing value (multiple coordinates in document)
this.endDates.add(endDate); break; // TODO: risk to mix up existing coordinate if latitude not given too
} catch (ParseException e) {}
case "startDate": // <meta itemprop="startDate" content="2016-04-21T20:00">
try {
// parse ISO 8601 date
Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime();
this.startDates.add(startDate);
} catch (ParseException e) {}
break;
case "endDate":
try {
// parse ISO 8601 date
Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime();
this.endDates.add(endDate);
} catch (ParseException e) {}
break;
}
} }
} }
} }

@ -87,7 +87,6 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.ProbabilisticClassifier;
@ -637,7 +636,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final List<Date> startDates = html.getStartDates(); final List<Date> startDates = html.getStartDates();
if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()])); if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()]));
final List<Date> endDates = html.getStartDates(); final List<Date> endDates = html.getEndDates();
if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()])); if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()]));
final List<String> articles = html.getArticles(); final List<String> articles = html.getArticles();

Loading…
Cancel
Save