diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html index 3133aebba..c3e60c1b8 100644 --- a/htroot/CrawlStartExpert.html +++ b/htroot/CrawlStartExpert.html @@ -480,12 +480,24 @@
Content Filter -

These are limitations on parts of a document. The filter will be applied after a web page was loaded.

+

These are limitations on parts of a document. The filter will be applied after a web page has been loaded. + You can choose to:

+
Evaluate by default
+
+ Use all words in document by default until a CSS class as listed below appears; then ignore all +
+
Ignore by default
+
+ Ignore all words in document by default until a CSS class as listed below appears, then evaluate all +
Filter div or nav class names
- + + + +
set of CSS class namescomma-separated list of <div> or <nav> element class names which should be filtered out
comma-separated list of <div> or <nav> element class names which should be filtered out or in, according to the switch above.
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 5222f9cd9..19f36b613 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -128,7 +128,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"), COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"), DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"), - VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"), + VALENCY_SWITCH_TAG_NAMES ("valency_switch_tag_names", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"), SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"), TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent"); @@ -290,7 +290,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet() : valencySwitchTagNames; String jsonString = new JSONArray(valencySwitchTagNames).toString(); put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name()); - put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString); + put(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key, jsonString); this.scraper = scraper == null ? 
new VocabularyScraper() : scraper; jsonString = this.scraper.toString(); assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString; @@ -312,7 +312,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M this.doms = new ConcurrentHashMap(); String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key); this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency); - String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key); + String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key); JSONArray a; if (jsonString == null) { a = new JSONArray(); @@ -1014,7 +1014,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", this.indexMedia() ? 1 : 0); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.COLLECTIONS.key, this.get(CrawlAttribute.COLLECTIONS.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.DEFAULT_VALENCY.key, this.get(CrawlAttribute.DEFAULT_VALENCY.key)); - prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, this.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key, this.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAMES.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.TIMEZONEOFFSET.key, this.get(CrawlAttribute.TIMEZONEOFFSET.key)); int i = 0; diff --git a/source/net/yacy/htroot/CrawlStartExpert.java b/source/net/yacy/htroot/CrawlStartExpert.java index 8a90510f5..9c8d55149 100644 --- a/source/net/yacy/htroot/CrawlStartExpert.java +++ b/source/net/yacy/htroot/CrawlStartExpert.java @@ -588,12 +588,28 @@ public class CrawlStartExpert { prop.put("list", agentNames.size()); prop.put("defaultAgentName", sb.isIntranetMode() ? 
ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName); - // ---------- Ignore Class Name - if (post != null && post.containsKey("ignoreclassname")) { - prop.put("ignoreclassname", - post.get("ignoreclassname", "")); + // ---------- Valency Switch Tag Names + if (post != null && post.containsKey("valency_switch_tag_names")) { + prop.put("valency_switch_tag_names", post.get("valency_switch_tag_names", "")); + } else { + prop.put("valency_switch_tag_names", ""); + } + if (post != null && post.containsKey("default_valency")) { + final String default_valency = post.get("default_valency", ""); + if (default_valency.equalsIgnoreCase("EVAL")){ + prop.put("default_valency_eval", 1); + prop.put("default_valency_ignore", 0); + } else if (default_valency.equalsIgnoreCase("IGNORE")) { + // IGNORE requested: mark only the ignore option as active + prop.put("default_valency_eval", 0); + prop.put("default_valency_ignore", 1); + } else { + prop.put("default_valency_eval", 1); + prop.put("default_valency_ignore", 0); + } } else { - prop.put("ignoreclassname", ""); + prop.put("default_valency_eval", 1); + prop.put("default_valency_ignore", 0); } // ---------- Enrich Vocabulary diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java index 44b944fe8..98ca68dc1 100644 --- a/source/net/yacy/htroot/Crawler_p.java +++ b/source/net/yacy/htroot/Crawler_p.java @@ -485,14 +485,19 @@ public class Crawler_p { final boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld"); final String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", ""); - final String ignoreclassname_s = post.get("ignoreclassname"); - final Set ignoreclassname = new HashSet<>(); - if (ignoreclassname_s != null) { - final String[] ignoreclassname_a = ignoreclassname_s.trim().split(","); - for (int i = 0; i < ignoreclassname_a.length; i++) { - ignoreclassname.add(ignoreclassname_a[i].trim()); + final String valency_switch_tag_names_s = 
post.get("valency_switch_tag_names"); + final Set valency_switch_tag_names = new HashSet<>(); + if (valency_switch_tag_names_s != null) { + final String[] valency_switch_tag_name_a = valency_switch_tag_names_s.trim().split(","); + for (int i = 0; i < valency_switch_tag_name_a.length; i++) { + valency_switch_tag_names.add(valency_switch_tag_name_a[i].trim()); } } + final String default_valency_radio = post.get("default_valency"); + TagValency default_valency = TagValency.EVAL; + if ("IGNORE".equalsIgnoreCase(default_valency_radio)) { + default_valency = TagValency.IGNORE; + } // get vocabulary scraper info final JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context @@ -626,8 +631,8 @@ public class Crawler_p { cachePolicy, collection, agentName, - TagValency.EVAL, - ignoreclassname, + default_valency, + valency_switch_tag_names, new VocabularyScraper(vocabulary_scraper), timezoneOffset);