added parsing of the content attribute in html tags with

itemprop='startDate' and itemprop='endDate'. The values of these attributes
are now written to the new solr fields startDates_dts and endDates_dts.
pull/1/head
Michael Peter Christen 10 years ago
parent a08a3c5f29
commit b060ba900d

@ -24,6 +24,12 @@ dates_in_content_dts
## the number of entries in dates_in_content_sxt ## the number of entries in dates_in_content_sxt
dates_in_content_count_i dates_in_content_count_i
## value of the content attribute of tags with itemprop='startDate'
startDates_dts
## value of the content attribute of tags with itemprop='endDate'
endDates_dts
## mime-type of document, string (mandatory field) ## mime-type of document, string (mandatory field)
content_type content_type

@ -26,7 +26,7 @@ public enum SolrType {
text_general("t", "txt"), // tokenizes with StandardTokenizer, removes stop words from case-insensitive "stopwords.txt", down cases, applies synonyms. text_general("t", "txt"), // tokenizes with StandardTokenizer, removes stop words from case-insensitive "stopwords.txt", down cases, applies synonyms.
text_en_splitting_tight(null, null), // can insert dashes in the wrong place and still match text_en_splitting_tight(null, null), // can insert dashes in the wrong place and still match
location("p", null), // lat,lon - format: specialized field for geospatial search. location("p", null), // lat,lon - format: specialized field for geospatial search.
date("dt", null), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z' date("dt", "dts"), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z'
bool("b", "bs", "boolean"), bool("b", "bs", "boolean"),
num_integer("i", "val", "int"), num_integer("i", "val", "int"),
num_long("l", "ls", "long"), num_long("l", "ls", "long"),

@ -177,6 +177,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map<String, DigestURL> hreflang, navigation; private final Map<String, DigestURL> hreflang, navigation;
private LinkedHashSet<String> titles; private LinkedHashSet<String> titles;
private final List<String> articles; private final List<String> articles;
private final List<Date> startDates, endDates;
//private String headline; //private String headline;
private List<String>[] headlines; private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic, underline; private final ClusteredScoreMap<String> bold, italic, underline;
@ -234,6 +235,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.script = new SizeLimitedSet<AnchorURL>(maxLinks); this.script = new SizeLimitedSet<AnchorURL>(maxLinks);
this.titles = new LinkedHashSet<String>(); this.titles = new LinkedHashSet<String>();
this.articles = new ArrayList<String>(); this.articles = new ArrayList<String>();
this.startDates = new ArrayList<>();
this.endDates = new ArrayList<>();
this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6); this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6);
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>(); for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>(false); this.bold = new ClusteredScoreMap<String>(false);
@ -374,8 +377,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
} }
/**
 * Evaluates attributes of a scraped tag that are handled the same way for
 * every tag type.
 * <p>
 * The value of the <code>class</code> attribute (empty string if absent) is
 * passed to the vocabulary scraper together with the tag content. If the tag
 * has both an <code>itemprop</code> and a <code>content</code> attribute and
 * the itemprop is <code>startDate</code> or <code>endDate</code>, the content
 * value is parsed with {@link ISO8601Formatter} and appended to the
 * corresponding date list. Values that cannot be parsed are skipped.
 *
 * @param tag the tag whose attributes are inspected
 */
private void checkOpts(Tag tag) {
    // vocabulary classes
    final String cssClasses = tag.opts.getProperty("class", EMPTY_STRING);
    this.vocabularyScraper.check(this.root, cssClasses, tag.content);

    // itemprop: nothing more to do unless both itemprop and content are present
    final String property = tag.opts.getProperty("itemprop");
    if (property == null) {
        return;
    }
    final String value = tag.opts.getProperty("content");
    if (value == null) {
        return;
    }

    // select the list that collects dates for this property
    final List<Date> target;
    if ("startDate".equals(property)) {
        target = this.startDates;
    } else if ("endDate".equals(property)) {
        target = this.endDates;
    } else {
        return;
    }

    try {
        // parse ISO 8601 date
        target.add(ISO8601Formatter.FORMATTER.parse(value));
    } catch (ParseException ignored) {
        // the value is not a parseable date; it is skipped
    }
}
@Override @Override
public void scrapeTag0(Tag tag) { public void scrapeTag0(Tag tag) {
checkOpts(tag);
if (tag.name.equalsIgnoreCase("img")) { if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING); final String src = tag.opts.getProperty("src", EMPTY_STRING);
try { try {
@ -514,9 +542,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override @Override
public void scrapeTag1(Tag tag) { public void scrapeTag1(Tag tag) {
final String classprop = tag.opts.getProperty("class", EMPTY_STRING); checkOpts(tag);
//System.out.println("class = " + classprop);
this.vocabularyScraper.check(this.root, classprop, tag.content);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING); String href = tag.opts.getProperty("href", EMPTY_STRING);
@ -748,6 +774,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.dd.toArray(new String[this.dd.size()]); return this.dd.toArray(new String[this.dd.size()]);
} }
/**
 * Returns the dates collected from tags with itemprop "startDate".
 *
 * @return the scraper's own list of start dates (not a copy); empty if no
 *         such date was parsed
 */
public List<Date> getStartDates() {
return this.startDates;
}
/**
 * Returns the dates collected from tags with itemprop "endDate".
 *
 * @return the scraper's own list of end dates (not a copy); empty if no
 *         such date was parsed
 */
public List<Date> getEndDates() {
return this.endDates;
}
public DigestURL[] getFlash() { public DigestURL[] getFlash() {
String ext; String ext;
ArrayList<DigestURL> f = new ArrayList<DigestURL>(); ArrayList<DigestURL> f = new ArrayList<DigestURL>();

@ -650,11 +650,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final String[] dt = html.getDt(); final String[] dt = html.getDt();
add(doc, CollectionSchema.dtcount_i, dt.length); add(doc, CollectionSchema.dtcount_i, dt.length);
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, li); if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt);
final String[] dd = html.getLi(); final String[] dd = html.getDd();
add(doc, CollectionSchema.ddcount_i, dd.length); add(doc, CollectionSchema.ddcount_i, dd.length);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, li); if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd);
// start dates scraped from the document; the field is only written if at least one date exists
final List<Date> startDates = html.getStartDates();
if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()]));
// end dates come from their own accessor, so that endDates_dts does not mirror startDates_dts
final List<Date> endDates = html.getEndDates();
if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()]));
final List<String> articles = html.getArticles(); final List<String> articles = html.getArticles();
add(doc, CollectionSchema.articlecount_i, articles.size()); add(doc, CollectionSchema.articlecount_i, articles.size());

@ -37,6 +37,8 @@ public enum CollectionSchema implements SchemaDeclaration {
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances"), dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances"),
dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"), dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"),
startDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='startDate'"),
endDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='endDate'"),
content_type(SolrType.string, true, true, true, false, false, "mime-type of document"), content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"), http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"),
www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"), www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"),
@ -272,6 +274,7 @@ public enum CollectionSchema implements SchemaDeclaration {
assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name; assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name; assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name; assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
assert !ext.equals("dts") || (type == SolrType.date && multiValued) : name;
assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name; assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name; assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name; assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;

Loading…
Cancel
Save