|
|
@ -146,22 +146,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
public String name;
|
|
|
|
public String name;
|
|
|
|
public Properties opts;
|
|
|
|
public Properties opts;
|
|
|
|
public CharBuffer content;
|
|
|
|
public CharBuffer content;
|
|
|
|
|
|
|
|
private TagValency tv;
|
|
|
|
/** Set to true when this tag should be ignored from scraping */
|
|
|
|
public Tag(final String name, TagValency defaultValency) {
|
|
|
|
private boolean ignore = false;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public Tag(final String name) {
|
|
|
|
|
|
|
|
this.name = name;
|
|
|
|
this.name = name;
|
|
|
|
|
|
|
|
this.tv = defaultValency;
|
|
|
|
this.opts = new Properties();
|
|
|
|
this.opts = new Properties();
|
|
|
|
this.content = new CharBuffer(MAX_TAGSIZE);
|
|
|
|
this.content = new CharBuffer(MAX_TAGSIZE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
public Tag(final String name, final Properties opts) {
|
|
|
|
public Tag(final String name, TagValency defaultValency, final Properties opts) {
|
|
|
|
this.name = name;
|
|
|
|
this.name = name;
|
|
|
|
|
|
|
|
this.tv = defaultValency;
|
|
|
|
this.opts = opts;
|
|
|
|
this.opts = opts;
|
|
|
|
this.content = new CharBuffer(MAX_TAGSIZE);
|
|
|
|
this.content = new CharBuffer(MAX_TAGSIZE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
public Tag(final String name, final Properties opts, final CharBuffer content) {
|
|
|
|
public Tag(final String name, TagValency defaultValency, final Properties opts, final CharBuffer content) {
|
|
|
|
this.name = name;
|
|
|
|
this.name = name;
|
|
|
|
|
|
|
|
this.tv = defaultValency;
|
|
|
|
this.opts = opts;
|
|
|
|
this.opts = opts;
|
|
|
|
this.content = content;
|
|
|
|
this.content = content;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -178,14 +178,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
|
|
|
|
|
|
|
|
/** @return true when this tag should be ignored from scraping */
|
|
|
|
/** @return true when this tag's valency is IGNORE, i.e. the tag should be skipped when scraping */
|
|
|
|
public boolean isIgnore() {
|
|
|
|
public boolean isIgnore() {
|
|
|
|
return this.ignore;
|
|
|
|
return this.tv == TagValency.IGNORE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public TagValency getValency() {
|
|
|
|
/**
|
|
|
|
return this.tv;
|
|
|
|
* @param ignore true when this tag should be ignored from scraping
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
public void setValency(final TagValency tv) {
|
|
|
|
public void setIgnore(final boolean ignore) {
|
|
|
|
this.tv = tv;
|
|
|
|
this.ignore = ignore;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -230,8 +229,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
|
|
|
|
|
|
|
|
private final VocabularyScraper vocabularyScraper;
|
|
|
|
private final VocabularyScraper vocabularyScraper;
|
|
|
|
|
|
|
|
|
|
|
|
/** Set of CSS class names whose matching div elements content should be ignored */
|
|
|
|
/** Set of CSS class names whose matching div or nav elements switch from the default valency to its reverse (EVAL to IGNORE or vice versa) */
|
|
|
|
private final Set<String> ignoreDivClassNames;
|
|
|
|
private final Set<String> valencySwitchTagNames;
|
|
|
|
|
|
|
|
private final TagValency defaultValency;
|
|
|
|
|
|
|
|
|
|
|
|
private final int timezoneOffset;
|
|
|
|
private final int timezoneOffset;
|
|
|
|
private int breadcrumbs;
|
|
|
|
private int breadcrumbs;
|
|
|
@ -261,19 +261,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
* @param root the document root url
|
|
|
|
* @param root the document root url
|
|
|
|
* @param maxAnchors the maximum number of URLs to process and store in the anchors property.
|
|
|
|
* @param maxAnchors the maximum number of URLs to process and store in the anchors property.
|
|
|
|
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
|
|
|
|
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
|
|
|
|
* @param ignoreDivClassNames an eventual set of CSS class names whose matching div elements content should be ignored
|
|
|
|
* @param valencySwitchTagNames an optional set (may be null) of CSS class names whose matching div or nav elements switch to the reverse of the default valency
|
|
|
|
|
|
|
|
* @param defaultValency the valency assigned to tags that are not switched; normally TagValency.EVAL
|
|
|
|
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
|
|
|
|
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
|
|
|
|
* @param timezoneOffset local time zone offset
|
|
|
|
* @param timezoneOffset local time zone offset
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
@SuppressWarnings("unchecked")
|
|
|
|
@SuppressWarnings("unchecked")
|
|
|
|
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
|
|
|
|
public ContentScraper(
|
|
|
|
|
|
|
|
final DigestURL root,
|
|
|
|
|
|
|
|
final int maxAnchors,
|
|
|
|
|
|
|
|
final int maxLinks,
|
|
|
|
|
|
|
|
final Set<String> valencySwitchTagNames,
|
|
|
|
|
|
|
|
final TagValency defaultValency,
|
|
|
|
|
|
|
|
final VocabularyScraper vocabularyScraper,
|
|
|
|
|
|
|
|
int timezoneOffset) {
|
|
|
|
// the root value here will not be used to load the resource.
|
|
|
|
// the root value here will not be used to load the resource.
|
|
|
|
// it is only the reference for relative links
|
|
|
|
// it is only the reference for relative links
|
|
|
|
super(linkTags0, linkTags1);
|
|
|
|
super(linkTags0, linkTags1);
|
|
|
|
assert root != null;
|
|
|
|
assert root != null;
|
|
|
|
this.root = root;
|
|
|
|
this.root = root;
|
|
|
|
this.vocabularyScraper = vocabularyScraper;
|
|
|
|
this.vocabularyScraper = vocabularyScraper;
|
|
|
|
this.ignoreDivClassNames = ignoreDivClassNames;
|
|
|
|
this.valencySwitchTagNames = valencySwitchTagNames;
|
|
|
|
|
|
|
|
this.defaultValency = defaultValency;
|
|
|
|
this.timezoneOffset = timezoneOffset;
|
|
|
|
this.timezoneOffset = timezoneOffset;
|
|
|
|
this.evaluationScores = new Evaluation();
|
|
|
|
this.evaluationScores = new Evaluation();
|
|
|
|
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
|
|
|
|
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
|
|
|
@ -321,8 +330,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
|
|
|
|
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
|
|
|
|
* @param timezoneOffset local time zone offset
|
|
|
|
* @param timezoneOffset local time zone offset
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
public ContentScraper(final DigestURL root, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
|
|
|
|
public ContentScraper(
|
|
|
|
this(root, Integer.MAX_VALUE, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
|
|
|
|
final DigestURL root,
|
|
|
|
|
|
|
|
final int maxLinks,
|
|
|
|
|
|
|
|
final Set<String> valencySwitchTagNames,
|
|
|
|
|
|
|
|
final TagValency defaultValency,
|
|
|
|
|
|
|
|
final VocabularyScraper vocabularyScraper,
|
|
|
|
|
|
|
|
int timezoneOffset) {
|
|
|
|
|
|
|
|
this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public TagValency defaultValency() {
|
|
|
|
|
|
|
|
return this.defaultValency;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
@ -333,7 +352,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public void scrapeText(final char[] newtext0, final Tag insideTag) {
|
|
|
|
public void scrapeText(final char[] newtext0, final Tag insideTag) {
|
|
|
|
if (insideTag != null) {
|
|
|
|
if (insideTag != null) {
|
|
|
|
if(insideTag.ignore) {
|
|
|
|
if (insideTag.tv == TagValency.IGNORE) {
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
|
|
|
|
if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
|
|
|
@ -720,7 +739,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public void scrapeTag0(final Tag tag) {
|
|
|
|
public void scrapeTag0(final Tag tag) {
|
|
|
|
if(tag.ignore) {
|
|
|
|
if (tag.tv == TagValency.IGNORE) {
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
checkOpts(tag);
|
|
|
|
checkOpts(tag);
|
|
|
@ -893,7 +912,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public void scrapeTag1(final Tag tag) {
|
|
|
|
public void scrapeTag1(final Tag tag) {
|
|
|
|
if(tag.ignore) {
|
|
|
|
if (tag.tv == TagValency.IGNORE) {
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
checkOpts(tag);
|
|
|
|
checkOpts(tag);
|
|
|
@ -1003,7 +1022,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public void scrapeAnyTagOpening(final Tag tag) {
|
|
|
|
public void scrapeAnyTagOpening(final Tag tag) {
|
|
|
|
if (tag != null && !tag.ignore && tag.opts != null) {
|
|
|
|
if (tag != null && tag.tv == TagValency.EVAL && tag.opts != null) {
|
|
|
|
/*
|
|
|
|
/*
|
|
|
|
* HTML microdata can be annotated on any kind of tag, so we don't restrict this
|
|
|
|
* HTML microdata can be annotated on any kind of tag, so we don't restrict this
|
|
|
|
* scraping to the limited sets in linkTags0 and linkTags1
|
|
|
|
* scraping to the limited sets in linkTags0 and linkTags1
|
|
|
@ -1013,24 +1032,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
|
|
|
|
public TagValency tagValency(final Tag tag, final Tag parentTag) {
|
|
|
|
boolean ignore = false;
|
|
|
|
if (parentTag != null && parentTag.tv != this.defaultValency) return parentTag.tv;
|
|
|
|
|
|
|
|
|
|
|
|
/* First, inherit ignore property from eventual parent */
|
|
|
|
|
|
|
|
if(parentTag != null) {
|
|
|
|
|
|
|
|
ignore = parentTag.ignore;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Parent is not marked as ignored : let's check the current tag */
|
|
|
|
if (this.valencySwitchTagNames != null &&
|
|
|
|
if (!ignore &&
|
|
|
|
|
|
|
|
this.ignoreDivClassNames != null &&
|
|
|
|
|
|
|
|
tag != null &&
|
|
|
|
tag != null &&
|
|
|
|
(TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
|
|
|
|
(TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
|
|
|
|
final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
|
|
|
|
final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
|
|
|
|
final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
|
|
|
|
final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
|
|
|
|
ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
|
|
|
|
if (!Collections.disjoint(this.valencySwitchTagNames, classes)) return this.defaultValency.reverse();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ignore;
|
|
|
|
return this.defaultValency;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
@ -1604,13 +1616,25 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|
|
|
if (page == null) throw new IOException("no content in file " + file.toString());
|
|
|
|
if (page == null) throw new IOException("no content in file " + file.toString());
|
|
|
|
|
|
|
|
|
|
|
|
// scrape document to look up charset
|
|
|
|
// scrape document to look up charset
|
|
|
|
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset);
|
|
|
|
final ScraperInputStream htmlFilter = new ScraperInputStream(
|
|
|
|
|
|
|
|
new ByteArrayInputStream(page),
|
|
|
|
|
|
|
|
StandardCharsets.UTF_8.name(),
|
|
|
|
|
|
|
|
new HashSet<String>(), TagValency.EVAL,
|
|
|
|
|
|
|
|
new VocabularyScraper(),
|
|
|
|
|
|
|
|
new DigestURL("http://localhost"),
|
|
|
|
|
|
|
|
false, maxLinks, timezoneOffset);
|
|
|
|
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
|
|
|
|
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
|
|
|
|
htmlFilter.close();
|
|
|
|
htmlFilter.close();
|
|
|
|
if (charset == null) charset = Charset.defaultCharset().toString();
|
|
|
|
if (charset == null) charset = Charset.defaultCharset().toString();
|
|
|
|
|
|
|
|
|
|
|
|
// scrape content
|
|
|
|
// scrape content
|
|
|
|
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
|
|
|
|
final ContentScraper scraper = new ContentScraper(
|
|
|
|
|
|
|
|
new DigestURL("http://localhost"),
|
|
|
|
|
|
|
|
maxLinks,
|
|
|
|
|
|
|
|
new HashSet<String>(),
|
|
|
|
|
|
|
|
TagValency.EVAL,
|
|
|
|
|
|
|
|
new VocabularyScraper(),
|
|
|
|
|
|
|
|
timezoneOffset);
|
|
|
|
final Writer writer = new TransformerWriter(null, null, scraper, false);
|
|
|
|
final Writer writer = new TransformerWriter(null, null, scraper, false);
|
|
|
|
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
|
|
|
|
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
|
|
|
|
writer.close();
|
|
|
|
writer.close();
|
|
|
|