front-end integration of tag valency

pull/554/head
Michael Peter Christen 2 years ago
parent 7f728bb4b4
commit 5a52b01c09

@ -480,12 +480,24 @@
</fieldset>
<fieldset>
<legend>Content Filter</legend>
<p>These are limitations on parts of a document. The filter will be applied after a web page was loaded.</p>
<p>These are limitations on parts of a document. The filter will be applied after a web page was loaded.
You can choose to:</p>
<dl>
<!-- Default tag valency: both radio buttons post the form parameter "default_valency", with value EVAL or IGNORE. -->
<dt>Evaluate by default</dt>
<!-- The checked attribute is driven by the template property default_valency_eval (presumably rendered when it is 1; confirm against the template engine). -->
<dd><input type="radio" name="default_valency" id="default_valency_eval" value="EVAL" #(default_valency_eval)#::checked="checked"#(/default_valency_eval)#/>
Use all words in document by default until a CSS class as listed below appears; then ignore all
</dd>
<dt>Ignore by default</dt>
<!-- The checked attribute is driven by the template property default_valency_ignore (same mechanism as above). -->
<dd><input type="radio" name="default_valency" id="default_valency_ignore" value="IGNORE" #(default_valency_ignore)#::checked="checked"#(/default_valency_ignore)#/>
Ignore all words in document by default until a CSS class as listed below appears, then evaluate all
</dd>
<dt>Filter div or nav class names</dt>
<dd>
<table style="border-width: 0px">
<!-- Text field posted as "ignoreclassname"; its value is pre-filled from the template property of the same name. -->
<tr><td style="width:110px">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; or &lt;nav&gt; element class names which should be filtered out</td></tr>
<!-- Text field posted as "valency_switch_tag_names": the comma-separated class names at which the default chosen above is switched. -->
<tr>
<td><input name="valency_switch_tag_names" id="valency_switch_tag_names" type="text" size="55" maxlength="100000" value="#[valency_switch_tag_names]#" onblur="if (this.value=='') this.value='';"/></td>
<td>comma-separated list of &lt;div&gt; or &lt;nav&gt; element class names which should be filtered out/in according to switch above.</td>
</tr>
</table>
</dd>
</dl>

@ -128,7 +128,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"),
VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"),
VALENCY_SWITCH_TAG_NAMES ("valency_switch_tag_names", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
@ -290,7 +290,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet<String>() : valencySwitchTagNames;
String jsonString = new JSONArray(valencySwitchTagNames).toString();
put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name());
put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString);
put(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
@ -312,7 +312,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key);
this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency);
String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key);
String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key);
JSONArray a;
if (jsonString == null) {
a = new JSONArray();
@ -1014,7 +1014,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", this.indexMedia() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.COLLECTIONS.key, this.get(CrawlAttribute.COLLECTIONS.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.DEFAULT_VALENCY.key, this.get(CrawlAttribute.DEFAULT_VALENCY.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, this.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key, this.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.TIMEZONEOFFSET.key, this.get(CrawlAttribute.TIMEZONEOFFSET.key));
int i = 0;

@ -588,12 +588,28 @@ public class CrawlStartExpert {
prop.put("list", agentNames.size());
prop.put("defaultAgentName", sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Ignore Class Name
if (post != null && post.containsKey("ignoreclassname")) {
prop.put("ignoreclassname",
post.get("ignoreclassname", ""));
// ---------- Valency Switch Tag Names
if (post != null && post.containsKey("valency_switch_tag_names")) {
prop.put("valency_switch_tag_names", post.get("valency_switch_tag_names", ""));
} else {
prop.put("valency_switch_tag_names", "");
}
// ---------- Default Valency
// Decide which of the two "default_valency" radio buttons is rendered as checked.
// Exactly one of default_valency_eval / default_valency_ignore is set to 1:
// IGNORE is selected only when the submitted value says so (case-insensitive);
// a missing parameter or any other value falls back to EVAL.
if (post != null && post.containsKey("default_valency")) {
    final boolean ignoreByDefault = post.get("default_valency", "").equalsIgnoreCase("IGNORE");
    // The two flags are always complementary. The IGNORE case previously set
    // default_valency_ignore to 1 and then straight back to 0, leaving no button checked.
    prop.put("default_valency_eval", ignoreByDefault ? 0 : 1);
    prop.put("default_valency_ignore", ignoreByDefault ? 1 : 0);
} else {
    // NOTE(review): this reset of "ignoreclassname" looks like a leftover of the
    // earlier ignore-class-name handling; kept unchanged, confirm whether it is still needed.
    prop.put("ignoreclassname", "");
    prop.put("default_valency_eval", 1);
    prop.put("default_valency_ignore", 0);
}
// ---------- Enrich Vocabulary

@ -485,14 +485,19 @@ public class Crawler_p {
final boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
final String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
final String ignoreclassname_s = post.get("ignoreclassname");
final Set<String> ignoreclassname = new HashSet<>();
if (ignoreclassname_s != null) {
final String[] ignoreclassname_a = ignoreclassname_s.trim().split(",");
for (int i = 0; i < ignoreclassname_a.length; i++) {
ignoreclassname.add(ignoreclassname_a[i].trim());
// Parse the comma-separated "valency_switch_tag_names" form field into a set of class names.
// Each entry is trimmed; blank entries are dropped, because "".split(",") yields a single
// empty token and an empty field would otherwise register "" as a class name.
final String valency_switch_tag_names_s = post.get("valency_switch_tag_names");
final Set<String> valency_switch_tag_names = new HashSet<>();
if (valency_switch_tag_names_s != null) {
    for (final String valency_switch_tag_name_raw : valency_switch_tag_names_s.split(",")) {
        final String valency_switch_tag_name = valency_switch_tag_name_raw.trim();
        if (!valency_switch_tag_name.isEmpty()) {
            valency_switch_tag_names.add(valency_switch_tag_name);
        }
    }
}
// Map the submitted "default_valency" radio value onto the TagValency enum.
// EVAL is the fallback for a missing or unrecognised value. The comparison is
// case-insensitive so that it accepts the same spellings as the form rendering in
// CrawlStartExpert, and it is null-safe because the literal is the receiver.
final String default_valency_radio = post.get("default_valency");
TagValency default_valency = TagValency.EVAL;
if ("IGNORE".equalsIgnoreCase(default_valency_radio)) {
    default_valency = TagValency.IGNORE;
}
// get vocabulary scraper info
final JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
@ -626,8 +631,8 @@ public class Crawler_p {
cachePolicy,
collection,
agentName,
TagValency.EVAL,
ignoreclassname,
default_valency,
valency_switch_tag_names,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);

Loading…
Cancel
Save