added parsing of the content attribute in html tags with

itemprop='startDate' and itemprop='endDate'. The values of these attributes are
now written to the new solr fields startDates_dts and endDates_dts.
pull/1/head
Michael Peter Christen 10 years ago
parent a08a3c5f29
commit b060ba900d

@ -24,6 +24,12 @@ dates_in_content_dts
## the number of entries in dates_in_content_sxt
dates_in_content_count_i
## content of itemprop attributes with content='startDate'
startDates_dts
## content of itemprop attributes with content='endDate'
endDates_dts
## mime-type of document, string (mandatory field)
content_type

@ -26,7 +26,7 @@ public enum SolrType {
text_general("t", "txt"), // tokenizes with StandardTokenizer, removes stop words from case-insensitive "stopwords.txt", down cases, applies synonyms.
text_en_splitting_tight(null, null), // can insert dashes in the wrong place and still match
location("p", null), // lat,lon - format: specialized field for geospatial search.
date("dt", null), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z'
date("dt", "dts"), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z'
bool("b", "bs", "boolean"),
num_integer("i", "val", "int"),
num_long("l", "ls", "long"),

@ -177,6 +177,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map<String, DigestURL> hreflang, navigation;
private LinkedHashSet<String> titles;
private final List<String> articles;
private final List<Date> startDates, endDates;
//private String headline;
private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic, underline;
@ -234,6 +235,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.script = new SizeLimitedSet<AnchorURL>(maxLinks);
this.titles = new LinkedHashSet<String>();
this.articles = new ArrayList<String>();
this.startDates = new ArrayList<>();
this.endDates = new ArrayList<>();
this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6);
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>(false);
@ -374,8 +377,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
/**
 * Evaluates the attributes of a tag that are handled the same way for every tag type.
 * <p>
 * The value of the {@code class} attribute (or the empty string if absent) is passed,
 * together with the tag content, to the vocabulary scraper. Afterwards, if the tag has
 * both an {@code itemprop} and a {@code content} attribute and the {@code itemprop}
 * value is exactly {@code startDate} or {@code endDate}, the {@code content} value is
 * parsed as an ISO 8601 date and appended to the matching date list. Values that
 * cannot be parsed are skipped silently.
 *
 * @param tag the tag whose attributes are inspected
 */
private void checkOpts(Tag tag) {
    // vocabulary classes
    final String cssClasses = tag.opts.getProperty("class", EMPTY_STRING);
    this.vocabularyScraper.check(this.root, cssClasses, tag.content);

    // itemprop: both the property name and its content attribute are required
    final String propertyName = tag.opts.getProperty("itemprop");
    if (propertyName == null) return;
    final String propertyValue = tag.opts.getProperty("content");
    if (propertyValue == null) return;

    // select the list that collects dates for this property; other properties are not handled here
    final List<Date> collector;
    if ("startDate".equals(propertyName)) {
        collector = this.startDates;
    } else if ("endDate".equals(propertyName)) {
        collector = this.endDates;
    } else {
        return;
    }

    try {
        // parse ISO 8601 date
        final Date parsed = ISO8601Formatter.FORMATTER.parse(propertyValue);
        collector.add(parsed);
    } catch (ParseException e) {
        // the attribute value is not a parseable date: leave the list unchanged
    }
}
@Override
public void scrapeTag0(Tag tag) {
checkOpts(tag);
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
@ -514,9 +542,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void scrapeTag1(Tag tag) {
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
//System.out.println("class = " + classprop);
this.vocabularyScraper.check(this.root, classprop, tag.content);
checkOpts(tag);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING);
@ -748,6 +774,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.dd.toArray(new String[this.dd.size()]);
}
/**
 * Returns the dates collected from tags with {@code itemprop="startDate"} whose
 * {@code content} attribute could be parsed as an ISO 8601 date.
 *
 * @return the scraper's own list of start dates (not a copy); empty if none were found
 */
public List<Date> getStartDates() {
return this.startDates;
}
/**
 * Returns the dates collected from tags with {@code itemprop="endDate"} whose
 * {@code content} attribute could be parsed as an ISO 8601 date.
 *
 * @return the scraper's own list of end dates (not a copy); empty if none were found
 */
public List<Date> getEndDates() {
return this.endDates;
}
public DigestURL[] getFlash() {
String ext;
ArrayList<DigestURL> f = new ArrayList<DigestURL>();

@ -650,11 +650,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final String[] dt = html.getDt();
add(doc, CollectionSchema.dtcount_i, dt.length);
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, li);
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt);
final String[] dd = html.getLi();
final String[] dd = html.getDd();
add(doc, CollectionSchema.ddcount_i, dd.length);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, li);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd);
// dates taken from itemprop="startDate" tags; the field is only written when at least one date exists
final List<Date> startDates = html.getStartDates();
if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()]));
// dates taken from itemprop="endDate" tags; must be read from getEndDates(), not getStartDates(),
// otherwise endDates_dts would be filled with a copy of the start dates
final List<Date> endDates = html.getEndDates();
if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()]));
final List<String> articles = html.getArticles();
add(doc, CollectionSchema.articlecount_i, articles.size());

@ -37,6 +37,8 @@ public enum CollectionSchema implements SchemaDeclaration {
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances"),
dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"),
startDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='startDate'"),
endDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='endDate'"),
content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"),
www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"),
@ -272,6 +274,7 @@ public enum CollectionSchema implements SchemaDeclaration {
assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
assert !ext.equals("dts") || (type == SolrType.date && multiValued) : name;
assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;

Loading…
Cancel
Save