introduction of tag-to-indexing relation TagValency

Branch: pull/554/head
Author: Michael Peter Christen (2 years ago)
Parent: 95e02e5291
Commit: 5acd98f4da

@ -28,13 +28,13 @@
<fieldset id="queues" style="width:210px;float:left;">
<legend>Queues</legend>
<table border="0" class="watchCrawler">
<thead>
<tr class="TableHeader">
<th width="120">Queue<br/>&nbsp;</th>
<th width="60">Size<br/>&nbsp;</th>
<th width="30"><span class="glyphicon glyphicon-wrench"></span></th>
</tr>
</thead>
<tbody>
<tr class="TableCellLight">
<td align="left"><a href="IndexCreateQueues_p.html?stack=LOCAL">Local Crawler</a></td>
@ -89,13 +89,13 @@
<fieldset id="indexsize" style="width:240px;float:left;">
<legend>Index Size</legend>
<table border="0" class="watchCrawler">
<thead>
<tr class="TableHeader">
<th width="130">Database<br/>&nbsp;</th>
<th width="50">Entries<br/>&nbsp;</th>
<th width="40">Seg-<br/>ments</th>
</tr>
</thead>
<tbody>
<tr class="TableCellLight">
<td align="left">Documents<br/><a href="#[urlpublictextSolrURL]#">solr search api</a></td>
@ -124,12 +124,12 @@
<legend>Progress</legend>
<form action="Crawler_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8">
<table border="0" class="watchCrawler">
<thead>
<tr class="TableHeader">
<th width="160">Indicator<br/>&nbsp;</th>
<th width="300" colspan="4">Level<br/>&nbsp;</th>
</tr>
</thead>
<tbody>
<tr class="TableCellLight">
<td align="left">Speed / PPM<br/>(Pages Per Minute)</td>
@ -147,7 +147,7 @@
<td align="left">Crawler PPM</td>
<td align="left" width="60"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
<td align="left" width="260px" colspan="3">
<progress id="ppmbar" max="30000" value="0" style="width:94%;"/>
<progress id="ppmbar" max="30000" value="0" style="width:94%;"/>
</td>
</tr>
<tr class="TableCellLight">
@ -180,13 +180,13 @@
<script>
function setTableSize() {
var maxh = Math.max(document.getElementById("progress").children[1].clientHeight, document.getElementById("indexsize").children[1].clientHeight, document.getElementById("queues").children[1].clientHeight) + 42;
if(lastMaxh !== maxh) {
var lastMaxh = maxh;
document.getElementById("indexsize").style.height = maxh + "px";
document.getElementById("progress").style.height = maxh + "px";
document.getElementById("queues").style.height = maxh + "px";
}
}
window.setInterval("setTableSize()", 1000);
</script>
@ -219,23 +219,23 @@ window.setInterval("setTableSize()", 1000);
If you crawl any un-wanted pages, you can delete them <a href="IndexCreateQueues_p.html?stack=LOCAL">here</a>.<br />::
<!-- 9 -->
No embedded local Solr index is connected. This is required to use a Solr query filter.
You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.::
<!-- 10 -->
The Solr filter query syntax is not valid : <code>#[solrQuery]#</code>::
<!-- 11 -->
Could not parse the Solr filter query : <code>#[solrQuery]#</code>
#(/info)#
</p>
#(wontReceiptRemoteResults)#::
<div class="alert alert-warning">
<p>You asked for remote indexing, but remote crawl results won't be added to the local index as the remote crawler is currently disabled on this peer.</p>
<p>You can activate it in the <a href="RemoteCrawl_p.html">Remote Crawl Configuration</a> page.</p>
</div>
#(/wontReceiptRemoteResults)#
<!-- #(noEmbeddedSolr)#::<div class="alert alert-error">No embedded local Solr index is connected. This is required to use the Solr filter query.
You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.</div>
#(/noEmbeddedSolr)#
#(solrQuerySyntaxtError)#::<div class="alert alert-error">The Solr filter query syntax is not valid : #[solrQuery]#</div>
@ -252,32 +252,32 @@ window.setInterval("setTableSize()", 1000);
<tr><td>
<table border="0" summary="A list of crawl profiles and their current settings." id="crawlProfiles">
<colgroup>
<col width="16" />
<col width="140"/>
<col width="16" />
<col width="140"/>
</colgroup>
<thead>
<tr class="TableHeader">
<th><strong>Name</strong></th>
#(debug)#::<th id="headerDebug"><strong>Count</strong></th>#(/debug)#
<th><strong>Status</strong></th>
</tr>
<tr class="TableHeader">
<th><strong>Name</strong></th>
#(debug)#::<th id="headerDebug"><strong>Count</strong></th>#(/debug)#
<th><strong>Status</strong></th>
</tr>
</thead>
<tbody>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#" id="#[handle]#">
<td>#[name]#</td>
#(debug)#::<td>#[count]#</td>#(/debug)#
<td id="#[handle]#_status_cell">#(terminateButton)#::
<div id="#[handle]#_status" style="text-decoration:blink;float:left;">Running</div>
<form id="#[handle]#_terminate" style="float:left;" action="Crawler_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8">
<div>
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="terminate" value="Terminate" class="btn btn-danger btn-xs"/>
</div>
</form>
#(/terminateButton)#
</td>
</tr>
<tr class="TableCell#(dark)#Light::Dark#(/dark)#" id="#[handle]#">
<td>#[name]#</td>
#(debug)#::<td>#[count]#</td>#(/debug)#
<td id="#[handle]#_status_cell">#(terminateButton)#::
<div id="#[handle]#_status" style="text-decoration:blink;float:left;">Running</div>
<form id="#[handle]#_terminate" style="float:left;" action="Crawler_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8">
<div>
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="terminate" value="Terminate" class="btn btn-danger btn-xs"/>
</div>
</form>
#(/terminateButton)#
</td>
</tr>
#{/list}#
</tbody>
</table>

@ -306,33 +306,33 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
JSONArray a;
if(jsonString == null) {
a = new JSONArray();
} else {
try {
a = new JSONArray(new JSONTokener(jsonString));
} catch(final JSONException e) {
ConcurrentLog.logException(e);
a = new JSONArray();
}
}
this.ignore_class_name = new HashSet<String>();
for (int i = 0; i < a.length(); i++) try {
this.ignore_class_name.add(a.getString(i));
} catch (JSONException e) {}
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
if (jsonString == null || jsonString.length() == 0) {
this.scraper = new VocabularyScraper();
} else {
VocabularyScraper loadedScraper;
try {
loadedScraper = new VocabularyScraper(jsonString);
} catch(final JSONException e) {
ConcurrentLog.logException(e);
loadedScraper = new VocabularyScraper();
}
this.scraper = loadedScraper;
}
}
@ -485,22 +485,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.crawlerurlmustmatch;
}
/**
* Render the urlMustMatchPattern as a String of limited size, suffixing it with
* "..." when it is truncated. Used to prevent unnecessary growth of the logs,
* and to prevent exceeding the field size limit for
* CollectionSchema.failreason_s (32k) when the pattern is present in a fail doc
* added to the Solr index.
*
* @return the urlMustMatchPattern formatted as a String of limited size
*/
public String formattedUrlMustMatchPattern() {
String patternStr = urlMustMatchPattern().toString();
if(patternStr.length() > 1000) {
/* The pattern may be quite large when using the 'From Link-List of URL' crawl start point. */
patternStr = patternStr.substring(0, Math.min(patternStr.length(), 1000)) + "...";
}
return patternStr;
}
/**
@ -517,47 +517,47 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.crawlerurlmustnotmatch;
}
/**
* Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getCrawlerOriginUrlMustMatchPattern() {
if (this.crawlerOriginUrlMustMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key);
try {
this.crawlerOriginUrlMustMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.crawlerOriginUrlMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
}
}
return this.crawlerOriginUrlMustMatch;
}
/**
* Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getCrawlerOriginUrlMustNotMatchPattern() {
if (this.crawlerOriginUrlMustNotMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key);
try {
this.crawlerOriginUrlMustNotMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.crawlerOriginUrlMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
}
}
return this.crawlerOriginUrlMustNotMatch;
}
@ -672,47 +672,47 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.indexcontentmustnotmatch;
}
/**
* Get the Pattern on media type that documents must match in order to be indexed
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getIndexMediaTypeMustMatchPattern() {
if (this.indexMediaTypeMustMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key);
try {
this.indexMediaTypeMustMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.indexMediaTypeMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
}
}
return this.indexMediaTypeMustMatch;
}
/**
* Get the Pattern on media type that documents must not match in order to be indexed
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getIndexMediaTypeMustNotMatchPattern() {
if (this.indexMediaTypeMustNotMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key);
try {
this.indexMediaTypeMustNotMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.indexMediaTypeMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
}
}
return this.indexMediaTypeMustNotMatch;
}
@ -734,29 +734,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
/**
* @return true when URLs of unsupported resources (no parser available or denied format) should
* be indexed as links (with metadata only on URL and not on content).
*/
public boolean isIndexNonParseableUrls() {
final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* @return true when the crawler must always cross check the eventual URL file
* extension against the actual Media Type, even when file extension is
* unknown or unsupported. False when the crawler should not load URLs
* with an unknown or unsupported file extension.
*/
public boolean isCrawlerAlwaysCheckMediaType() {
final String r = get(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key);
if (r == null) {
return false;
}
return (r.equals(Boolean.TRUE.toString()));
}
public CacheStrategy cacheStrategy() {
final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
@ -952,7 +952,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
public void putProfileEntry(
final String CRAWL_PROFILE_PREFIX,
final serverObjects prop,
final boolean active,
final boolean dark,
@ -1022,22 +1022,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
public static void main(String[] args) {
// test to convert the key set from set to string and back
Set<String> a = new HashSet<>();
a.add("eins"); a.add("zwei"); a.add("drei");
JSONArray j = new JSONArray(a);
String s = j.toString();
System.out.println(s);
JSONTokener o = new JSONTokener(s);
try {
j = new JSONArray(o);
System.out.println(j);
Set<String> h = new HashSet<String>();
for (int i = 0; i < j.length(); i++) h.add(j.getString(i));
System.out.println(h);
} catch (JSONException e) {
e.printStackTrace();
}
}
}

@ -60,6 +60,7 @@ import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.data.BookmarksDB.Tag;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
+ import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.FileUtils;
@ -117,12 +118,12 @@ public class BookmarkHelper {
// --------------------------------------
public static int importFromBookmarks(final BookmarksDB db, final DigestURL baseURL, final String input, final String tag, final boolean importPublic){
// convert string to input stream
final ByteArrayInputStream byteIn = new ByteArrayInputStream(UTF8.getBytes(input));
final InputStreamReader reader = new InputStreamReader(byteIn, StandardCharsets.UTF_8);
// import stream
return importFromBookmarks(db, baseURL, reader, tag, importPublic);
}
private static int importFromBookmarks(final BookmarksDB db, final DigestURL baseURL, final InputStreamReader input, final String tag, final boolean importPublic){
@ -135,7 +136,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
//load the links
- final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0);
+ final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, false);
FileUtils.copy(input,writer);

@ -73,7 +73,7 @@ import net.yacy.kelondro.util.ISO639;
public class ContentScraper extends AbstractScraper implements Scraper {
private final static int MAX_TAGSIZE = 1024 * 1024;
public static final int MAX_DOCSIZE = 40 * 1024 * 1024;
private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "&#039;".toCharArray();
@ -88,9 +88,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private static final Pattern LB = Pattern.compile("\n");
public enum TagType {
/** Tag with no end tag (see https://www.w3.org/TR/html51/syntax.html#void-elements),
* optional end tag (see https://www.w3.org/TR/html51/syntax.html#optional-tags),
* or where processing directly only the start tag is desired. */
singleton,
/** Paired tag : has a start tag and an end tag (https://www.w3.org/TR/html51/syntax.html#normal-elements) */
pair;
@ -146,22 +146,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String name;
public Properties opts;
public CharBuffer content;
- /** Set to true when this tag should be ignored from scraping */
- private boolean ignore = false;
- public Tag(final String name) {
+ private TagValency tv;
+ public Tag(final String name, TagValency defaultValency) {
this.name = name;
+ this.tv = defaultValency;
this.opts = new Properties();
this.content = new CharBuffer(MAX_TAGSIZE);
}
- public Tag(final String name, final Properties opts) {
+ public Tag(final String name, TagValency defaultValency, final Properties opts) {
this.name = name;
+ this.tv = defaultValency;
this.opts = opts;
this.content = new CharBuffer(MAX_TAGSIZE);
}
- public Tag(final String name, final Properties opts, final CharBuffer content) {
+ public Tag(final String name, TagValency defaultValency, final Properties opts, final CharBuffer content) {
this.name = name;
+ this.tv = defaultValency;
this.opts = opts;
this.content = content;
}
@ -178,15 +178,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/** @return true when this tag should be ignored from scraping */
public boolean isIgnore() {
- return this.ignore;
- }
- /**
- * @param ignore true when this tag should be ignored from scraping
- */
- public void setIgnore(final boolean ignore) {
- this.ignore = ignore;
- }
+ return this.tv == TagValency.IGNORE;
+ }
+ public TagValency getValency() {
+ return this.tv;
+ }
+ public void setValency(final TagValency tv) {
+ this.tv = tv;
+ }
}
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
@ -205,10 +204,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final List<ImageEntry> images;
private final SizeLimitedSet<AnchorURL> script, frames, iframes;
/**
* URLs of linked data item types referenced from HTML content with standard
* annotations such as RDFa, microdata, microformats or JSON-LD
*/
private final SizeLimitedSet<DigestURL> linkedDataTypes;
private final SizeLimitedMap<String, String> metas;
@ -230,8 +229,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final VocabularyScraper vocabularyScraper;
- /** Set of CSS class names whose matching div elements content should be ignored */
- private final Set<String> ignoreDivClassNames;
+ /** Set of CSS class names whose matching div elements may switch from IGNORE to EVAL or vice versa */
+ private final Set<String> valencySwitchTagNames;
+ private final TagValency defaultValency;
private final int timezoneOffset;
private int breadcrumbs;
@ -261,19 +261,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param root the document root url
* @param maxAnchors the maximum number of URLs to process and store in the anchors property.
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
- * @param ignoreDivClassNames an eventual set of CSS class names whose matching div elements content should be ignored
+ * @param valencySwitchTagNames an eventual set of CSS class names whose matching div elements content should be ignored
+ * @param defaultValency the valency default; should be TagValency.EVAL by default
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
@SuppressWarnings("unchecked")
- public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+ public ContentScraper(
+ final DigestURL root,
+ final int maxAnchors,
+ final int maxLinks,
+ final Set<String> valencySwitchTagNames,
+ final TagValency defaultValency,
+ final VocabularyScraper vocabularyScraper,
+ int timezoneOffset) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.vocabularyScraper = vocabularyScraper;
- this.ignoreDivClassNames = ignoreDivClassNames;
+ this.valencySwitchTagNames = valencySwitchTagNames;
+ this.defaultValency = defaultValency;
this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -321,10 +330,20 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
- public ContentScraper(final DigestURL root, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
- this(root, Integer.MAX_VALUE, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
+ public ContentScraper(
+ final DigestURL root,
+ final int maxLinks,
+ final Set<String> valencySwitchTagNames,
+ final TagValency defaultValency,
+ final VocabularyScraper vocabularyScraper,
+ int timezoneOffset) {
+ this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
}
+ public TagValency defaultValency() {
+ return this.defaultValency;
+ }
@Override
public void finish() {
this.content.trimToSize();
@ -333,12 +352,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void scrapeText(final char[] newtext0, final Tag insideTag) {
if (insideTag != null) {
- if(insideTag.ignore) {
+ if (insideTag.tv == TagValency.IGNORE) {
return;
}
if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
return;
}
}
int p, pl, q, s = 0;
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
@ -414,21 +433,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
List<ContentScraperListener> anchorListeners = new ArrayList<>();
for (int i = 0; i < listeners.length; i += 2) {
if (listeners[i] == ContentScraperListener.class) {
anchorListeners.add((ContentScraperListener)listeners[i+1]);
}
}
if(!this.maxAnchorsExceeded) {
int maxLinksToDetect = this.maxAnchors - this.anchors.size();
if(maxLinksToDetect < Integer.MAX_VALUE) {
/* Add one to the anchors limit to detect when the limit is exceeded */
maxLinksToDetect++;
}
findAbsoluteURLs(b, this.anchors, anchorListeners, maxLinksToDetect);
if(this.anchors.size() > this.maxAnchors) {
this.maxAnchorsExceeded = true;
this.anchors.remove(this.anchors.size() -1);
}
}
// append string to content
@ -453,7 +472,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
public static long findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners, final long maxURLs) {
if(text == null) {
return 0;
}
int schemePosition, offset = 0;
boolean hasWhiteSpace;
@ -465,7 +484,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
long detectedURLsCount = 0;
while (offset < text.length() && detectedURLsCount < maxURLs) {
if(!urlSchemeMatcher.find(offset)) {
break;
}
schemePosition = urlSchemeMatcher.start();
@ -473,26 +492,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
urlString = text.substring(schemePosition, hasWhiteSpace ? whiteSpaceMatcher.start() : text.length());
if (urlString.endsWith(".")) {
urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
}
/* URLs can contain brackets, furthermore as they can even be reserved characters in the URI syntax (see https://tools.ietf.org/html/rfc3986#section-2.2)
* But when unpaired, in most cases this is that the unpaired bracket is not part of the URL, but rather used to wrap it in the text*/
urlString = removeUnpairedBrackets(urlString, '(', ')');
urlString = removeUnpairedBrackets(urlString, '{', '}');
urlString = removeUnpairedBrackets(urlString, '[', ']');
offset = schemePosition + urlString.length();
try {
url = new AnchorURL(urlString);
detectedURLsCount++;
if(urls != null) {
urls.add(url);
}
if(listeners != null) {
for(ContentScraperListener listener : listeners) {
listener.anchorAdded(url.toNormalform(false));
}
}
} catch (final MalformedURLException ignored) {}
}
return detectedURLsCount;
@ -505,62 +524,62 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param listeners a collection of listeners to trigger.
*/
public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
findAbsoluteURLs(text, urls, listeners, Long.MAX_VALUE);
}
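As a usage illustration (an editor's sketch, not part of this commit): the following self-contained snippet calls the public findAbsoluteURLs helper shown above. The package of AnchorURL is assumed to be net.yacy.cora.document.id, as elsewhere in the YaCy code base.

import java.util.ArrayList;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.parser.html.ContentScraper;

public class FindUrlsDemo {
    public static void main(String[] args) {
        final List<AnchorURL> urls = new ArrayList<>();
        // The unpaired trailing ')' only wraps the URL in the text and is stripped,
        // while the balanced '(b)' inside the URL is kept; the trailing '.' after
        // example.net is removed as sentence punctuation.
        ContentScraper.findAbsoluteURLs("see (http://example.com/a(b)c) and http://example.net.", urls, null);
        for (final AnchorURL url : urls) {
            System.out.println(url.toNormalform(false));
        }
    }
}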
/**
* Analyze bracket pairs found in the string and eventually
* return a truncated version of that string when one or more pairs are incomplete
*
* @param str
* the string to analyze
* @param openingMark
* the opening bracket character (example : '{')
* @param closingMark
* the closing bracket character (example : '}')
* @return the original string or a truncated copy
*/
protected static String removeUnpairedBrackets(final String str, final char openingMark,
final char closingMark) {
if(str == null) {
return null;
}
String result = str;
char ch;
int depth = 0, index = 0, lastUnpairedOpeningIndex = -1;
/* Loop on all characters of the string */
for(; index < str.length(); index++) {
ch = str.charAt(index);
if(ch == openingMark) {
if(depth == 0) {
lastUnpairedOpeningIndex = index;
}
depth++;
} else if(ch == closingMark) {
depth--;
if(depth == 0) {
lastUnpairedOpeningIndex = -1;
}
}
if(depth < 0) {
/* Unpaired closing mark : stop the loop here */
break;
}
}
if (depth > 0) {
/* One or more unpaired opening marks : truncate at the first opening level */
if(lastUnpairedOpeningIndex >= 0) {
result = str.substring(0, lastUnpairedOpeningIndex);
}
} else if (depth < 0) {
/* One or more unpaired closing marks : truncate at the current index as the loop should have been exited with a break */
if(index >= 0) {
result = str.substring(0, index);
}
}
return result;
}
/**
* @param relativePath relative path to this document base URL
@ -574,42 +593,42 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
/**
* Parse the eventual microdata itemtype attribute of a tag and extract its
* valid URL tokens when the itemscope attribute is present.
*
* @param tagAttributes parsed HTML tag attributes.
* @return a set of URLs eventually empty when no itemtype attribute is present
* or when its value is not valid
* @see <a href="https://www.w3.org/TR/microdata/#dfn-itemtype">itemtype
* definition at W3C</a>
* @see <a href=
* "https://html.spec.whatwg.org/multipage/microdata.html#attr-itemtype">itemtype
* definition at WHATWG</a>
*/
private Set<DigestURL> parseMicrodataItemType(final Properties tagAttributes) {
final Set<DigestURL> types = new HashSet<>();
if (tagAttributes != null) {
/*
* The itemtype attribute must not be specified on elements that do not have an
* itemscope attribute specified. So we lazily check here for itemscope boolean
* attribute presence (strictly conforming parsing would also check it has no
* value or the value is the empty string or "itemscope")
*/
if (tagAttributes.getProperty("itemscope") != null) {
final Set<String> itemTypes = parseSpaceSeparatedTokens(tagAttributes.getProperty("itemtype"));
for (final String itemType : itemTypes) {
try {
types.add(new DigestURL(itemType));
} catch (final MalformedURLException ignored) {
/* Each itemtype space-separated token must be a valid absolute URL */
}
}
}
}
return types;
}
private void checkOpts(final Tag tag) {
// vocabulary classes
@ -651,53 +670,53 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
/**
* Parses sizes icon link attribute. (see
* http://www.w3.org/TR/html5/links.html#attr-link-sizes) Eventual
* duplicates are removed.
*
* @param sizesAttr
* sizes attribute string, may be null
* @return a set of sizes eventually empty.
*/
public static Set<Dimension> parseSizes(String sizesAttr) {
Set<Dimension> sizes = new HashSet<Dimension>();
Set<String> tokens = parseSpaceSeparatedTokens(sizesAttr);
for (String token : tokens) {
/*
* "any" keyword may be present, but doesn't have to produce a
* dimension result
*/
if (token != null) {
Matcher matcher = IconEntry.SIZE_PATTERN.matcher(token);
if (matcher.matches()) {
/* With given pattern no NumberFormatException can occur */
sizes.add(new Dimension(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2))));
}
}
}
return sizes;
}
/**
* Parses a space separated tokens attribute value (see
* http://www.w3.org/TR/html5/infrastructure.html#space-separated-tokens).
* Eventual duplicates are removed.
*
* @param attr
* attribute string, may be null
* @return a set of tokens eventually empty
*/
public static Set<String> parseSpaceSeparatedTokens(final String attr) {
Set<String> tokens = new HashSet<>();
/* Check attr string is not empty to avoid adding a single empty string
* in result */
if (attr != null && !attr.trim().isEmpty()) {
String[] items = attr.trim().split(CommonPattern.SPACES.pattern());
Collections.addAll(tokens, items);
}
return tokens;
}
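A short usage sketch for the two parsers above (not part of the commit; Dimension is assumed to be java.awt.Dimension, which matches its width/height use here):

import java.awt.Dimension;
import java.util.Set;
import net.yacy.document.parser.html.ContentScraper;

public class ParseDemo {
    public static void main(String[] args) {
        // "any" is allowed but yields no dimension; the duplicate 16x16 collapses in the Set.
        final Set<Dimension> sizes = ContentScraper.parseSizes("16x16 32x32 any 16x16");
        System.out.println(sizes.size()); // 2
        // Space-separated tokens are trimmed and deduplicated as well.
        final Set<String> tokens = ContentScraper.parseSpaceSeparatedTokens(" icon  shortcut icon ");
        System.out.println(tokens); // [icon, shortcut] in some order
    }
}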
/**
* Retain only icon relations (standard and non standard) from tokens .
@ -705,13 +724,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @return a Set of icon relations, eventually empty
*/
public Set<String> retainIconRelations(Collection<String> relTokens) {
HashSet<String> iconRels = new HashSet<>();
for(String token : relTokens) {
if(IconLinkRelations.isIconRel(token)) {
iconRels.add(token.toLowerCase(Locale.ENGLISH));
}
}
return iconRels;
}
/**
@ -720,9 +739,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
@Override
public void scrapeTag0(final Tag tag) {
- if(tag.ignore) {
+ if (tag.tv == TagValency.IGNORE) {
return;
}
checkOpts(tag);
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
@ -740,23 +759,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} catch (final NumberFormatException e) {}
this.evaluationScores.match(Element.imgpath, src);
} else if(tag.name.equalsIgnoreCase("base")) {
final String baseHref = tag.opts.getProperty("href", EMPTY_STRING);
if(!baseHref.isEmpty()) {
/* We must use here AnchorURL.newAnchor as the base href may also be an URL relative to the document URL */
try {
this.root = AnchorURL.newAnchor(this.root, baseHref);
} catch (final MalformedURLException | RuntimeException ignored) {
/* Nothing more to do when the base URL is malformed */
}
}
} else if (tag.name.equalsIgnoreCase("frame")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
if(src != null) {
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
//this.addAnchor(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true));
}
} else if (tag.name.equalsIgnoreCase("body")) {
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
@ -786,9 +805,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
tag.opts.put("name", areatitle);
AnchorURL url = absolutePath(href);
if(url != null) {
tag.opts.put("href", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
tag.opts.put("href", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
}
}
} else if (tag.name.equalsIgnoreCase("link")) {
@ -808,18 +827,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
Set<String> iconRels = retainIconRelations(relTokens);
/* Distinguish icons from images. It will enable for example to later search only images and no icons */
if (!iconRels.isEmpty()) {
String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING);
Set<Dimension> sizes = parseSizes(sizesAttr);
IconEntry icon = this.icons.get(newLink);
/* There is already an icon with same URL for this document :
* they may have different rel attribute or different sizes (multi sizes ico file) or this may be a duplicate */
if(icon != null) {
icon.getRel().addAll(iconRels);
icon.getSizes().addAll(sizes);
} else {
icon = new IconEntry(newLink, iconRels, sizes);
this.icons.put(newLink, icon);
}
} else if (rel.equalsIgnoreCase("canonical")) {
tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.setAll(tag.opts);
@ -863,19 +882,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (name.equalsIgnoreCase("movie")) {
AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
if(url != null) {
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
}
}
} else if (tag.name.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
if(src != null) {
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
// this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
// this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
}
} else if (tag.name.equalsIgnoreCase("html")) {
final String lang = tag.opts.getProperty("lang", EMPTY_STRING);
@ -893,9 +912,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
@Override
public void scrapeTag1(final Tag tag) {
- if(tag.ignore) {
+ if (tag.tv == TagValency.IGNORE) {
return;
}
checkOpts(tag);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
@ -916,12 +935,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
final String h;
if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[0].add(h);
@ -971,10 +990,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (tag.name.equalsIgnoreCase("script")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) {
AnchorURL absoluteSrc = absolutePath(src);
if(absoluteSrc != null) {
this.script.add(absoluteSrc);
}
this.evaluationScores.match(Element.scriptpath, src);
} else {
this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" "));
@ -996,54 +1015,47 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
}
/**
* Scraping operation applied to any kind of tag opening, being either singleton
* or paired tag, not restricted to tags listed in
* {@link ContentScraper#linkTags0} and {@link ContentScraper#linkTags1}.
*/
@Override
public void scrapeAnyTagOpening(final Tag tag) {
- if (tag != null && !tag.ignore && tag.opts != null) {
+ if (tag != null && tag.tv == TagValency.EVAL && tag.opts != null) {
/*
* HTML microdata can be annotated on any kind of tag, so we don't restrict this
* scraping to the limited sets in linkTags0 and linkTags1
*/
this.linkedDataTypes.addAll(parseMicrodataItemType(tag.opts));
}
}
@Override
- public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
- boolean ignore = false;
- /* First, inherit ignore property from eventual parent */
- if(parentTag != null) {
- ignore = parentTag.ignore;
- }
- /* Parent is not marked as ignored : let's check the current tag */
- if (!ignore &&
- this.ignoreDivClassNames != null &&
- tag != null &&
- (TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
- final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
- final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
- ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
- }
- return ignore;
- }
+ public TagValency tagValency(final Tag tag, final Tag parentTag) {
+ if (parentTag != null && parentTag.tv != this.defaultValency) return parentTag.tv;
+ if (this.valencySwitchTagNames != null &&
+ tag != null &&
+ (TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
+ final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
+ final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
+ if (!Collections.disjoint(this.valencySwitchTagNames, classes)) return this.defaultValency.reverse();
+ }
+ return this.defaultValency;
+ }
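To make the new control flow concrete, here is a minimal, self-contained sketch (with hypothetical names such as SimpleTag; not part of the commit) of the valency rules implemented above: a child inherits any non-default valency from its parent, and a div or nav tag whose class attribute matches one of the switch names flips the default valency.

import java.util.Set;

public class TagValencyDemo {

    // Assumed shape of the new enum, inferred from its uses in this diff.
    enum TagValency {
        EVAL, IGNORE;
        TagValency reverse() { return this == EVAL ? IGNORE : EVAL; }
    }

    // Minimal stand-in for ContentScraper.Tag, for illustration only.
    static final class SimpleTag {
        final String name;
        final Set<String> classes;
        TagValency tv;
        SimpleTag(final String name, final Set<String> classes) {
            this.name = name;
            this.classes = classes;
        }
    }

    // Mirrors the decision flow of ContentScraper.tagValency(tag, parentTag).
    static TagValency tagValency(final SimpleTag tag, final SimpleTag parent,
            final Set<String> valencySwitchTagNames, final TagValency defaultValency) {
        // A parent with a non-default valency propagates it to its children.
        if (parent != null && parent.tv != defaultValency) return parent.tv;
        // Only div and nav tags may switch valency, based on their CSS classes.
        if (valencySwitchTagNames != null && tag != null
                && ("div".equals(tag.name) || "nav".equals(tag.name))) {
            for (final String cls : tag.classes) {
                if (valencySwitchTagNames.contains(cls)) return defaultValency.reverse();
            }
        }
        return defaultValency;
    }

    public static void main(String[] args) {
        final Set<String> switchNames = Set.of("sidebar");
        final SimpleTag nav = new SimpleTag("nav", Set.of("sidebar"));
        nav.tv = tagValency(nav, null, switchNames, TagValency.EVAL);    // switched to IGNORE
        final SimpleTag child = new SimpleTag("p", Set.of());
        child.tv = tagValency(child, nav, switchNames, TagValency.EVAL); // inherits IGNORE
        System.out.println(nav.tv + " / " + child.tv);
    }
}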
/**
* Add an anchor to the anchors list, and trigger any eventual listener
* @param anchor anchor to add. Must not be null.
*/
protected void addAnchor(AnchorURL anchor) {
if(this.anchors.size() >= this.maxAnchors) {
this.maxAnchorsExceeded = true;
} else {
this.anchors.add(anchor);
this.fireAddAnchor(anchor.toNormalform(false));
}
}
@ -1203,13 +1215,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.iframes;
}
/**
* @return URLs of linked data item types referenced from HTML content with standard
* annotations such as RDFa, microdata, microformats or JSON-LD
*/
public SizeLimitedSet<DigestURL> getLinkedDataTypes() {
return this.linkedDataTypes;
}
public Set<AnchorURL> getScript() {
return this.script;
@ -1258,32 +1270,32 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @return true when the limit on content size scraped has been exceeded
*/
public boolean isContentSizeLimitExceeded() {
return this.contentSizeLimitExceeded;
}
/**
* @param contentSizeLimitExceeded set to true when a limit on content size scraped has been exceeded
*/
public void setContentSizeLimitExceeded(final boolean contentSizeLimitExceeded) {
this.contentSizeLimitExceeded = contentSizeLimitExceeded;
}
/**
* @return true when the maxAnchors limit has been exceeded
*/
public boolean isMaxAnchorsExceeded() {
return this.maxAnchorsExceeded;
}
/**
* @return true when at least one limit on content size, anchors number or links number has been exceeded
*/
public boolean isLimitsExceeded() {
return this.contentSizeLimitExceeded || this.maxAnchorsExceeded || this.css.isLimitExceeded()
|| this.rss.isLimitExceeded() || this.embeds.isLimitExceeded() || this.metas.isLimitExceeded()
|| this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded()
|| this.frames.isLimitExceeded() || this.iframes.isLimitExceeded() || this.linkedDataTypes.isLimitExceeded();
}
/*
DC in html example:
@ -1544,11 +1556,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void registerHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
if(listener instanceof ContentScraperListener) {
this.htmlFilterEventListeners.add(ContentScraperListener.class, (ContentScraperListener)listener);
} else {
this.htmlFilterEventListeners.add(ScraperListener.class, listener);
}
}
}
@ -1559,11 +1571,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void deregisterHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
if(listener instanceof ContentScraperListener) {
this.htmlFilterEventListeners.remove(ContentScraperListener.class, (ContentScraperListener)listener);
} else {
this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
}
}
}
@ -1604,13 +1616,25 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
- final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset);
+ final ScraperInputStream htmlFilter = new ScraperInputStream(
+ new ByteArrayInputStream(page),
+ StandardCharsets.UTF_8.name(),
+ new HashSet<String>(), TagValency.EVAL,
+ new VocabularyScraper(),
+ new DigestURL("http://localhost"),
+ false, maxLinks, timezoneOffset);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(
new DigestURL("http://localhost"),
maxLinks,
new HashSet<String>(),
TagValency.EVAL,
new VocabularyScraper(),
timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();

@ -26,26 +26,26 @@ package net.yacy.document.parser.html;
public interface Scraper {
/**
* @param tag
* a tag name
* @return true when the tag name belongs to the first category of tags
* according to the Scraper implementation, and is therefore a candidate
* for processing by the
* {@link #scrapeTag0(net.yacy.document.parser.html.ContentScraper.Tag)}
* implementation
*/
public boolean isTag0(String tag);
/**
* @param tag
* a tag name
* @return true when the tag name belongs to the second category of tags
* according to the Scraper implementation, and is therefore a candidate
* for processing by the
* {@link #scrapeTag1(net.yacy.document.parser.html.ContentScraper.Tag)}
* implementation
*/
public boolean isTag1(String tag);
/**
@ -73,14 +73,16 @@ public interface Scraper {
*/
public void scrapeAnyTagOpening(ContentScraper.Tag tag);
/**
* @param tag
* a parsed tag
* @param parentTag the parent tag, if any
* @return the valency (EVAL or IGNORE) to apply to this tag according to
* the scraper implementation rules
*/
public TagValency tagValency(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
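/**
* @return the valency this scraper applies to tags that are not matched by
* any valency-switch rule
*/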
public TagValency defaultValency();
public void scrapeComment(final char[] comment);

@ -62,7 +62,8 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream(
final InputStream inStream,
final String inputStreamCharset,
final Set<String> valencySwitchTagNames,
final TagValency defaultValency,
final VocabularyScraper vocabularyScraper,
final DigestURL rooturl,
final boolean passbyIfBinarySuspect,
@ -73,7 +74,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
scraper.registerHtmlFilterEventListener(this);
try {

@ -0,0 +1,30 @@
/**
* TagValency
* Copyright 2023 by Michael Peter Christen, @0rb1t3r
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
public enum TagValency {
IGNORE, // do not index that tag
EVAL; // do index that tag
public TagValency reverse() {
return this == IGNORE ? EVAL : IGNORE;
}
}
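The enum drives the new tag-to-indexing relation: a scraper answers defaultValency() for ordinary tags and may flip that answer in tagValency(tag, parentTag) for tags named in the valencySwitchTagNames set. A minimal sketch of such a rule, assuming accessor names on ContentScraper.Tag (getValency, getProperty) that are not part of this diff:
// Sketch only: the field names follow the constructor parameters introduced above;
// the Tag accessors used here are assumptions, not part of this changeset.
@Override
public TagValency tagValency(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag) {
    // a switched parent propagates its valency to the whole subtree
    if (parentTag != null && parentTag.getValency() != this.defaultValency) {
        return parentTag.getValency();
    }
    // tags whose class attribute is listed in the switch set get the opposite of the default
    if (this.valencySwitchTagNames != null
            && this.valencySwitchTagNames.contains(tag.getProperty("class", ""))) {
        return this.defaultValency.reverse();
    }
    return this.defaultValency;
}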

@ -64,14 +64,14 @@ public final class TransformerWriter extends Writer {
private boolean inComment;
private boolean binaryUnsuspect;
private final boolean passbyIfBinarySuspect;
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
final Scraper scraper,
final boolean passbyIfBinarySuspect
) {
this(outStream, charSet, scraper, passbyIfBinarySuspect, 64);
}
public TransformerWriter(
@ -229,7 +229,7 @@ public final class TransformerWriter extends Writer {
// we are not collecting tag text -> cases (1) - (3)
// case (1): this is not a tag opener/closer
if (this.scraper != null && content.length > 0) {
this.scraper.scrapeText(content, null);
}
return content;
}
@ -283,24 +283,24 @@ public final class TransformerWriter extends Writer {
private char[] filterTagOpening(final String tagname, final char[] content) {
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
ContentScraper.Tag tag = new ContentScraper.Tag(tagname, this.scraper.defaultValency(), charBuffer.propParser());
charBuffer.close();
final ContentScraper.Tag parentTag;
if(this.tagStack.size() > 0) {
parentTag = this.tagStack.lastElement();
} else {
parentTag = null;
}
/* Apply the scraper's valency rules: the tag is marked EVAL or IGNORE */
if (this.scraper != null) {
tag.setValency(this.scraper.tagValency(tag, parentTag));
}
/* Apply processing relevant for any kind of tag opening */
if(this.scraper != null) {
this.scraper.scrapeAnyTagOpening(tag);
}
if (this.scraper != null && this.scraper.isTag0(tagname)) {

@ -57,6 +57,7 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter;
@ -276,7 +277,16 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
htmlFilter = new ScraperInputStream(
sourceStream,
documentCharset,
ignore_class_name,
TagValency.EVAL,
vocabularyScraper,
location,
false,
maxLinks,
timezoneOffset);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
} catch (final IOException e1) {
@ -311,7 +321,14 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content
// for this static method no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper(
location,
maxAnchors,
maxLinks,
ignore_class_name,
TagValency.EVAL,
vocabularyScraper,
timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
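// derive the character budget from the byte budget via the decoder's average chars-per-byte ratio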
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());

@ -66,6 +66,7 @@ import net.yacy.data.WorkTables;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
@ -733,8 +734,13 @@ public class Crawler_p {
} else {
/* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(
new DigestURL(crawlingFile),
10000000,
new HashSet<String>(),
TagValency.EVAL,
new VocabularyScraper(),
profile.timezoneOffset());
final FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
@ -874,20 +880,20 @@ public class Crawler_p {
final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Let's report here detailed error to help user when he selected a wrong file */
if(!crawlingFile.exists()) {
writer.close();
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exist");
}
if(!crawlingFile.isFile()) {
writer.close();
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
}
if(!crawlingFile.canRead()) {
writer.close();
throw new IOException("Cannot read: " + crawlingFile.getAbsolutePath());
}
}
