front-end integration of tag valency

pull/554/head
Michael Peter Christen 2 years ago
parent 7f728bb4b4
commit 5a52b01c09

@ -480,12 +480,24 @@
</fieldset> </fieldset>
<fieldset> <fieldset>
<legend>Content Filter</legend> <legend>Content Filter</legend>
<p>These are limitations on parts of a document. The filter will be applied after a web page was loaded.</p> <p>These are limitations on parts of a document. The filter will be applied after a web page was loaded.
You can choose to:</p>
<dl> <dl>
<dt>Evaluate by default</dt>
<dd><input type="radio" name="default_valency" id="default_valency_eval" value="EVAL" #(default_valency_eval)#::checked="checked"#(/default_valency_eval)#/>
Use all words in document by default until a CSS class as listed below appears; then ignore all
</dd>
<dt>Ignore by default</dt>
<dd><input type="radio" name="default_valency" id="default_valency_ignore" value="IGNORE" #(default_valency_ignore)#::checked="checked"#(/default_valency_ignore)#/>
Ignore all words in document by default until a CSS class as listed below appears, then evaluate all
</dd>
<dt>Filter div or nav class names</dt> <dt>Filter div or nav class names</dt>
<dd> <dd>
<table style="border-width: 0px"> <table style="border-width: 0px">
<tr><td style="width:110px">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; or &lt;nav&gt; element class names which should be filtered out</td></tr> <tr>
<td><input name="valency_switch_tag_names" id="valency_switch_tag_names" type="text" size="55" maxlength="100000" value="#[valency_switch_tag_names]#" onblur="if (this.value=='') this.value='';"/></td>
<td>comma-separated list of &lt;div&gt; or &lt;nav&gt; element class names which should be filtered out/in according to switch above.</td>
</tr>
</table> </table>
</dd> </dd>
</dl> </dl>

@ -128,7 +128,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"), INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"), COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"), DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"),
VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"), VALENCY_SWITCH_TAG_NAMES ("valency_switch_tag_names", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"), SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent"); TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
@ -290,7 +290,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet<String>() : valencySwitchTagNames; this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet<String>() : valencySwitchTagNames;
String jsonString = new JSONArray(valencySwitchTagNames).toString(); String jsonString = new JSONArray(valencySwitchTagNames).toString();
put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name()); put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name());
put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString); put(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper; this.scraper = scraper == null ? new VocabularyScraper() : scraper;
jsonString = this.scraper.toString(); jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString; assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
@ -312,7 +312,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.doms = new ConcurrentHashMap<String, AtomicInteger>(); this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key); String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key);
this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency); this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency);
String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key); String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key);
JSONArray a; JSONArray a;
if (jsonString == null) { if (jsonString == null) {
a = new JSONArray(); a = new JSONArray();
@ -1014,7 +1014,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", this.indexMedia() ? 1 : 0); prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", this.indexMedia() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.COLLECTIONS.key, this.get(CrawlAttribute.COLLECTIONS.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.COLLECTIONS.key, this.get(CrawlAttribute.COLLECTIONS.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.DEFAULT_VALENCY.key, this.get(CrawlAttribute.DEFAULT_VALENCY.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.DEFAULT_VALENCY.key, this.get(CrawlAttribute.DEFAULT_VALENCY.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, this.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key, this.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.TIMEZONEOFFSET.key, this.get(CrawlAttribute.TIMEZONEOFFSET.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.TIMEZONEOFFSET.key, this.get(CrawlAttribute.TIMEZONEOFFSET.key));
int i = 0; int i = 0;

@ -588,12 +588,28 @@ public class CrawlStartExpert {
prop.put("list", agentNames.size()); prop.put("list", agentNames.size());
prop.put("defaultAgentName", sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName); prop.put("defaultAgentName", sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Ignore Class Name // ---------- Valency Switch Tag Names
if (post != null && post.containsKey("ignoreclassname")) { if (post != null && post.containsKey("valency_switch_tag_names")) {
prop.put("ignoreclassname", prop.put("valency_switch_tag_names", post.get("valency_switch_tag_names", ""));
post.get("ignoreclassname", ""));
} else { } else {
prop.put("ignoreclassname", ""); prop.put("valency_switch_tag_names", "");
}
if (post != null && post.containsKey("default_valency")) {
final String default_valency = post.get("default_valency", "");
if (default_valency.equalsIgnoreCase("EVAL")){
prop.put("default_valency_eval", 1);
prop.put("default_valency_ignore", 0);
} else if (default_valency.equalsIgnoreCase("IGNORE")) {
prop.put("default_valency_eval", 0);
prop.put("default_valency_ignore", 1);
prop.put("default_valency_ignore", 0);
} else {
prop.put("default_valency_eval", 1);
prop.put("default_valency_ignore", 0);
}
} else {
prop.put("default_valency_eval", 1);
prop.put("default_valency_ignore", 0);
} }
// ---------- Enrich Vocabulary // ---------- Enrich Vocabulary

@ -485,14 +485,19 @@ public class Crawler_p {
final boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld"); final boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
final String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", ""); final String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
final String ignoreclassname_s = post.get("ignoreclassname"); final String valency_switch_tag_names_s = post.get("valency_switch_tag_names");
final Set<String> ignoreclassname = new HashSet<>(); final Set<String> valency_switch_tag_names = new HashSet<>();
if (ignoreclassname_s != null) { if (valency_switch_tag_names_s != null) {
final String[] ignoreclassname_a = ignoreclassname_s.trim().split(","); final String[] valency_switch_tag_name_a = valency_switch_tag_names_s.trim().split(",");
for (int i = 0; i < ignoreclassname_a.length; i++) { for (int i = 0; i < valency_switch_tag_name_a.length; i++) {
ignoreclassname.add(ignoreclassname_a[i].trim()); valency_switch_tag_names.add(valency_switch_tag_name_a[i].trim());
} }
} }
// Determine the default tag valency from the posted "default_valency" field.
// The comparison is case-insensitive so that this handler agrees with the
// crawl start form, which also matches the value with equalsIgnoreCase.
// A missing value (null) or anything other than "IGNORE" falls back to EVAL.
final String default_valency_radio = post.get("default_valency");
TagValency default_valency = TagValency.EVAL;
if ("IGNORE".equalsIgnoreCase(default_valency_radio)) {
default_valency = TagValency.IGNORE;
}
// get vocabulary scraper info // get vocabulary scraper info
final JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context final JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
@ -626,8 +631,8 @@ public class Crawler_p {
cachePolicy, cachePolicy,
collection, collection,
agentName, agentName,
TagValency.EVAL, default_valency,
ignoreclassname, valency_switch_tag_names,
new VocabularyScraper(vocabulary_scraper), new VocabularyScraper(vocabulary_scraper),
timezoneOffset); timezoneOffset);

Loading…
Cancel
Save