crawl profile adoption to new tag valency attribute

pull/554/head
Michael Christen 2 years ago
parent 5acd98f4da
commit 4304e07e6f
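This commit generalizes the former ignore_class_name parameter into a pair of values: a default TagValency that applies to all tags, and a set of tag/class names at which that default is switched. All default crawl profiles and existing call sites pass TagValency.EVAL, so their behaviour is unchanged. For orientation only, the TagValency type imported below is presumably a small enum along the following lines; the IGNORE constant and the invert() helper are illustrative assumptions, not part of this diff:

    public enum TagValency {
        EVAL,    // tag content is evaluated (the default passed throughout this commit)
        IGNORE;  // tag content is skipped

        // assumed helper: the value applied when a valency switch tag/class matches
        public TagValency invert() {
            return this == EVAL ? IGNORE : EVAL;
        }
    }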

@ -51,6 +51,7 @@ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
@ -60,7 +61,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
public static final String CRAWL_PROFILE_RECRAWL_JOB = "recrawlJob";
@ -75,7 +76,7 @@ public final class CrawlSwitchboard {
public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
static {
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP);
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW);
DEFAULT_PROFILES.add(CRAWL_PROFILE_RECRAWL_JOB);
DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY);
@ -93,11 +94,11 @@ public final class CrawlSwitchboard {
// Default time cycle in minutes before an indexed URL by a given crawl profile can be accepted for recrawl */
/**
* The default recrawl time cycle in minutes for recrawl jobs. The recrawl date
* limit can be set up by the recrawl job selection query, but a default limit
* prevents unwanted overload on targets.
*/
public static final long CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE = 60L; // one hour
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; // one day
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
@ -139,7 +140,7 @@ public final class CrawlSwitchboard {
try {
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException | SpaceExceededException | RuntimeException e ) {
ConcurrentLog.warn("CrawlProfiles", "Could not load profile " + handle, e);
p = null;
}
if ( p == null ) {
@ -275,16 +276,15 @@ public final class CrawlSwitchboard {
public RowHandleSet getURLHashes(final byte[] profileKey) {
return this.profilesActiveCrawlsCounter.get(ASCII.String(profileKey));
}
private void initActiveCrawlProfiles() {
final Switchboard sb = Switchboard.getSwitchboard();
// generate new default entry for deep auto crawl
this.defaultAutocrawlDeepProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_DEEP,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
@ -308,12 +308,13 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
this.defaultAutocrawlDeepProfile);
// generate new default entry for shallow auto crawl
this.defaultAutocrawlShallowProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
@ -341,6 +342,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -364,7 +366,7 @@ public final class CrawlSwitchboard {
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true),
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
@ -373,6 +375,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -405,6 +408,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -437,6 +441,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -469,6 +474,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -509,6 +515,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -541,6 +548,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -573,6 +581,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -605,6 +614,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -640,6 +650,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);

@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
@ -355,7 +356,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0);
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL, null, null, 0);
return profile;
}

@ -55,6 +55,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
@ -69,19 +70,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
/** Regular expression pattern matching everything */
public static final String MATCH_ALL_STRING = ".*";
/** Regular expression pattern matching nothing */
public static final String MATCH_NEVER_STRING = "";
/** Empty Solr query */
public static final String SOLR_EMPTY_QUERY = "";
/** Match all Solr query */
public static final String SOLR_MATCH_ALL_QUERY = AbstractSolrConnector.CATCHALL_QUERY;
/** Regular expression matching everything */
public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING);
/** Regular expression matching nothing */
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
@ -126,14 +127,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"),
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"),
DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"),
VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
public static final int BOOLEAN = 0;
public static final int INTEGER = 1;
public static final int STRING = 2;
public final String key, label;
public final boolean readonly;
public final int type;
@ -143,39 +145,39 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.type = type;
this.label = label;
}
@Override
public String toString() {
return this.key;
}
}
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
/** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustMatch = null;
/** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustNotMatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
/** Pattern on the media type documents must match before being indexed
* @see CollectionSchema#content_type */
private Pattern indexMediaTypeMustMatch = null;
/** Pattern on the media type documents must not match before being indexed
* @see CollectionSchema#content_type */
private Pattern indexMediaTypeMustNotMatch = null;
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final Set<String> ignore_class_name;
private final TagValency defaultValency;
private final Set<String> valencySwitchTagNames;
private final VocabularyScraper scraper;
/**
@ -238,7 +240,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
@ -252,40 +255,42 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.NAME.key, name);
put(CrawlAttribute.AGENT_NAME.key, userAgentName);
put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
put(CrawlAttribute.CRAWLER_URL_NODEPTHLIMITMATCH.key, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
put(CrawlAttribute.INDEXING_URL_MUSTMATCH.key, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
put(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
put(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
put(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
put(CrawlAttribute.DEPTH.key, depth);
put(CrawlAttribute.DIRECT_DOC_BY_URL.key, directDocByURL);
put(CrawlAttribute.RECRAWL_IF_OLDER.key, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
put(CrawlAttribute.DOM_MAX_PAGES.key, domMaxPages);
put(CrawlAttribute.CRAWLING_Q.key, crawlingQ); // crawling of urls with '?'
put(CrawlAttribute.FOLLOW_FRAMES.key, followFrames); // load pages contained in frames or iframes
put(CrawlAttribute.OBEY_HTML_ROBOTS_NOINDEX.key, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
put(CrawlAttribute.OBEY_HTML_ROBOTS_NOFOLLOW.key, obeyHtmlRobotsNofollow);
put(CrawlAttribute.INDEX_TEXT.key, indexText);
put(CrawlAttribute.INDEX_MEDIA.key, indexMedia);
put(CrawlAttribute.STORE_HTCACHE.key, storeHTCache);
put(CrawlAttribute.REMOTE_INDEXING.key, remoteIndexing);
put(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key, snapshotsMaxDepth);
put(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key, snapshotsLoadImage);
put(CrawlAttribute.SNAPSHOTS_REPLACEOLD.key, snapshotsReplaceOld);
put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch);
put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the ignore_class_name and scraper information into a JSON Array
this.ignore_class_name = ignore_class_name == null ? new HashSet<String>() : ignore_class_name;
String jsonString = new JSONArray(ignore_class_name).toString();
put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString);
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet<String>() : valencySwitchTagNames;
String jsonString = new JSONArray(valencySwitchTagNames).toString();
put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name());
put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
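The two new profile attributes round-trip through the profile's underlying key/value map: the default valency is stored under its attribute key via name() and restored with valueOf(), falling back to EVAL when the entry is missing (see the reading constructor in the next hunk), while the switch tag/class names are serialized as a JSON array, just as the old ignore_class_name set was. A minimal sketch of that round trip, with hypothetical class names and the bundled JSONArray assumed:

    // sketch only: persist and restore the new attributes (keys as declared in CrawlAttribute)
    final Map<String, String> ext = new HashMap<String, String>();
    ext.put("default_valency", TagValency.EVAL.name());
    ext.put("valency_switch_tag_name", new JSONArray(Arrays.asList("sidebar", "nav")).toString()); // hypothetical names

    final String v = ext.get("default_valency");
    final TagValency restored = (v == null || v.length() == 0) ? TagValency.EVAL : TagValency.valueOf(v);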
@ -305,9 +310,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key);
this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency);
String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key);
JSONArray a;
if(jsonString == null) {
if (jsonString == null) {
a = new JSONArray();
} else {
try {
@ -317,9 +324,9 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
a = new JSONArray();
}
}
this.ignore_class_name = new HashSet<String>();
this.valencySwitchTagNames = new HashSet<String>();
for (int i = 0; i < a.length(); i++) try {
this.ignore_class_name.add(a.getString(i));
this.valencySwitchTagNames.add(a.getString(i));
} catch (JSONException e) {}
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
if (jsonString == null || jsonString.length() == 0) {
@ -336,14 +343,18 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public Set<String> ignoreDivClassName() {
return this.ignore_class_name;
public TagValency defaultValency() {
return this.defaultValency;
}
public Set<String> valencySwitchTagNames() {
return this.valencySwitchTagNames;
}
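With these accessors, a caller that holds a CrawlProfile can hand both values straight to the parser layer; the extended TextParser.parseSource() signature later in this commit accepts them right after the charset. A hedged call-site sketch (profile, url and the size/stream variables are placeholders):

    // sketch: parse with the profile's valency settings instead of a bare ignore set
    final Document[] docs = TextParser.parseSource(
            url, mimeType, charset,
            profile.defaultValency(),          // TagValency.EVAL unless the profile says otherwise
            profile.valencySwitchTagNames(),   // tag/class names at which the default valency is switched
            profile.scraper(), timezoneOffset, depth, contentLength, sourceStream);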
public VocabularyScraper scraper() {
return this.scraper;
}
public void domInc(final String domain) {
if (domain == null) return; // may be correct for file system crawls
final AtomicInteger dp = this.doms.get(domain);
@ -427,7 +438,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
//if (r == null) return null;
return r;
}
private Map<String, Pattern> cmap = null;
/**
@ -440,7 +451,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.cmap = collectionParser(r);
return this.cmap;
}
public static Map<String, Pattern> collectionParser(String collectionString) {
if (collectionString == null || collectionString.length() == 0) return new HashMap<String, Pattern>();
String[] cs = CommonPattern.COMMA.split(collectionString);
@ -470,7 +481,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String r = get(CrawlAttribute.COLLECTIONS.key);
return r == null || r.length() == 0 || "user".equals(r) ? name() : r;
}
/**
* Gets the regex which must be matched by URLs in order to be crawled.
* @return regex which must be matched
@ -484,7 +495,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerurlmustmatch;
}
/**
* Render the urlMustMatchPattern as a String of limited size, suffixing it with
* "..." when it is truncated. Used to prevent unnecessary growth of the logs,
@ -516,7 +527,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerurlmustnotmatch;
}
/**
* Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
*
@ -538,7 +549,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerOriginUrlMustMatch;
}
/**
* Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
*
@ -601,7 +612,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (list.length == 1 && list.length == 0) list = new String[0];
return list;
}
/**
* If the regex matches with the url, then there is no depth limit on the crawl (it overrides depth == 0)
* @return regex which must be matched
@ -643,7 +654,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexurlmustnotmatch;
}
/**
* Gets the regex which must be matched by URLs in order to be indexed.
* @return regex which must be matched
@ -671,7 +682,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexcontentmustnotmatch;
}
/**
* Get the Pattern on media type that documents must match in order to be indexed
*
@ -693,7 +704,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexMediaTypeMustMatch;
}
/**
* Get the Pattern on media type that documents must not match in order to be indexed
*
@ -715,9 +726,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexMediaTypeMustNotMatch;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).
@ -743,7 +752,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* @return true when the crawler must always cross check the eventual URL file
* extension against the actual Media Type, even when file extension is
@ -772,7 +781,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public void setCacheStrategy(final CacheStrategy newStrategy) {
put(CrawlAttribute.CACHE_STRAGEGY.key, newStrategy.toString());
}
/**
* Gets the minimum date that an entry must have to be re-crawled.
* @return time in ms representing a date
@ -847,13 +856,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(CrawlAttribute.REMOTE_INDEXING.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public int snapshotMaxdepth() {
final String r = get(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key);
if (r == null) return -1;
@ -866,7 +875,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return -1;
}
}
public boolean snapshotLoadImage() {
final String r = get(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key);
if (r == null) return false;
@ -878,7 +887,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public Pattern snapshotsMustnotmatch() {
if (this.snapshotsMustnotmatch == null) {
final String r = get(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key);
@ -887,7 +896,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
} catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.snapshotsMustnotmatch;
}
}
public int timezoneOffset() {
final String timezoneOffset = get(CrawlAttribute.TIMEZONEOFFSET.key);
@ -898,7 +907,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0;
}
}
/**
* get a recrawl date for a given age in minutes
* @param oldTimeMinutes
@ -946,7 +955,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase(Locale.ROOT))).append(url.getPath()).append(".*").toString();
}
public boolean isPushCrawlProfile() {
return this.name().startsWith(CrawlProfile.CRAWL_PROFILE_PUSH_STUB);
}
@ -1008,7 +1017,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton);
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle());
int i = 0;
if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) {
String item;
@ -1021,7 +1030,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
}
public static void main(String[] args) {
// test to convert the key set from set to string and back
Set<String> a = new HashSet<>();

@ -48,6 +48,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
public class Response {
@ -853,7 +854,7 @@ public class Response {
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator)
// 7) local surrogates processing (can not be known here : crawl profile is required)
EventOrigin processCase = EventOrigin.UNKNOWN;
// FIXME the equals seems to be incorrect: String.equals(boolean)
if (initiator() == null || initiator().length == 0 || ASCII.String(initiator()).equals("------------")) {
@ -873,9 +874,13 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet<String>(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
return TextParser.parseSource(
url(), this.responseHeader == null ? null : this.responseHeader.getContentType(),
this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(),
TagValency.EVAL, new HashSet<String>(),
new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {
return null;
}

@ -32,6 +32,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.TagValency;
public abstract class AbstractParser implements Parser {
@ -41,20 +42,20 @@ public abstract class AbstractParser implements Parser {
protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name;
/**
* initialize a parser with a name
* @param name
*/
public AbstractParser(final String name) {
this.name = name;
}
/*
* The following abstract implementations create a circular call which would cause an endless loop when called.
* They are both here because one of them must be overridden by the implementing class.
*/
@Override
public Document[] parse(
DigestURL url,
@ -64,7 +65,7 @@ public abstract class AbstractParser implements Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source);
return parse(url, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source);
}
@Override
@ -72,15 +73,15 @@ public abstract class AbstractParser implements Parser {
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
/*
* The following abstract implementations create a circular call which would cause an endless loop when called.
* They are both here because one of them must be overridden by the implementing class.
@ -88,32 +89,33 @@ public abstract class AbstractParser implements Parser {
@Override
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
return parseWithLimits(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
}
@Override
public Document[] parseWithLimits(
DigestURL location,
String mimeType,
String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, maxLinks, maxBytes);
}
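These default implementations keep the old behaviour: the extended overloads simply drop the valency arguments and delegate to the shorter signatures, so existing parser classes compile unchanged. Only a parser that actually evaluates tags (presumably the HTML parser) needs to override the extended form; a rough sketch, where applyValency() stands in for whatever a concrete parser does with the settings:

    @Override
    public Document[] parse(final DigestURL url, final String mimeType, final String charset,
            final TagValency defaultValency, final Set<String> valencySwitchTagNames,
            final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
            throws Parser.Failure, InterruptedException {
        // illustrative only: feed the valency settings into the tag scraping instead of discarding them
        return applyValency(defaultValency, valencySwitchTagNames, url, mimeType, charset, scraper, timezoneOffset, source);
    }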
/**
* return the name of the parser
*/
@ -164,12 +166,11 @@ public abstract class AbstractParser implements Parser {
if (t != null) c.add(t);
return c;
}
@Override
public boolean isParseWithLimitsSupported() {
/* Please override on subclasses when parseWithLimits is supported */
return false;
}
}

@ -28,6 +28,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.parser.html.TagValency;
public interface Parser {
@ -63,72 +64,87 @@ public interface Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
/**
* Parse an input stream, eventually terminating processing when a total of
* maxLinks URLs (anchors, image links, media links...) have been reached,
* or when maxBytes content bytes have been processed, thus potentially
* resulting in partially parsed documents (with
* {@link Document#isPartiallyParsed()} returning true). Some parser
* implementations will not support parsing within maxLinks or maxBytes
* limits: make sure to check this first by calling
* {@link #isParseWithLimitsSupported()}, or an UnsupportedOperationException
* could be thrown.
*
* @param url
* the URL of the source
* @param mimeType
* the mime type of the source, if known
* @param charset
* the charset name of the source, if known
* @param scraper
* an entity scraper to detect facets from text annotation
* context
* @param timezoneOffset
* the local time zone offset
* @param source
* an input stream
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
* @throws InterruptedException
* when the processing was interrupted before termination
* @throws UnsupportedOperationException
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(
DigestURL url,
String mimeType,
String charset,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**
* @return true when the parser implementation supports the
* parseWithLimits() operation.
*/
public boolean isParseWithLimitsSupported();
// methods that shall make it possible to put Parser objects into a hashtable

@ -51,6 +51,7 @@ import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.linkScraperParser;
import net.yacy.document.parser.mmParser;
@ -184,7 +185,8 @@ public final class TextParser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -201,7 +203,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -218,7 +220,8 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -236,7 +239,7 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
@ -248,7 +251,8 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignoreClassNames,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -261,14 +265,15 @@ public final class TextParser {
final Set<Parser> idioms = new HashSet<>();
idioms.add(TextParser.genericIdiom);
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
}
private static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -330,7 +335,7 @@ public final class TextParser {
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try {
return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
return parseSource(location, mimeType, parser, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset,
nonCloseInputStream, maxLinks, maxBytes);
} catch (final Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
@ -378,11 +383,11 @@ public final class TextParser {
int maxBytesToRead = -1;
if(maxBytes < Integer.MAX_VALUE) {
/* Load at most maxBytes + 1 :
- to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure
- but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
maxBytesToRead = (int)maxBytes + 1;
}
if(contentLength >= 0 && contentLength < maxBytesToRead) {
if (contentLength >= 0 && contentLength < maxBytesToRead) {
maxBytesToRead = (int)contentLength;
}
@ -392,16 +397,23 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
public static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE);
}
@ -424,10 +436,19 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
public static Document[] parseWithLimits(
final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream,
int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@ -449,10 +470,11 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
public static Document[] parseWithLimits(
final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
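The overload without valency parameters remains for callers that have no crawl profile at hand (such as Response.parse() above); it simply defaults to TagValency.EVAL and an empty switch set. A hedged comparison of the two call styles (all arguments are placeholders):

    // old arity: implicitly TagValency.EVAL, no switch tags
    final Document[] a = TextParser.parseWithLimits(url, mimeType, charset,
            timezoneOffset, depth, contentLength, sourceStream, maxLinks, maxBytes);
    // profile-aware arity: explicit valency handling
    final Document[] b = TextParser.parseWithLimits(url, mimeType, charset,
            profile.defaultValency(), profile.valencySwitchTagNames(),
            timezoneOffset, depth, contentLength, sourceStream, maxLinks, maxBytes);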
@ -475,7 +497,8 @@ public final class TextParser {
final String mimeType,
final Parser parser,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream,
@ -491,11 +514,11 @@ public final class TextParser {
try {
final Document[] docs;
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser do not support partial parsing within limits : let's control it here*/
final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch(final Parser.Failure e) {
@ -524,7 +547,8 @@ public final class TextParser {
final String mimeType,
final Set<Parser> parsers,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -552,13 +576,13 @@ public final class TextParser {
}
try {
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else {
/* Partial parsing is not supported by this parser : check content length now */
if(sourceArray.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&

@ -68,6 +68,7 @@ import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.NamePrefixThreadFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -101,11 +102,11 @@ public class MediawikiImporter extends Thread implements Importer {
public MediawikiImporter(final MultiProtocolURL sourcefile, final File targetdir) {
super("MediawikiImporter(" + sourcefile != null ? sourcefile.toNormalform(true) : "null sourcefile" +")");
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
super("MediawikiImporter(" + sourcefile != null ? sourcefile.toNormalform(true) : "null sourcefile" +")");
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
this.count = 0;
this.start = 0;
this.hostport = null;
@ -154,7 +155,7 @@ public class MediawikiImporter extends Thread implements Importer {
}
@SuppressWarnings("resource")
@Override
public void run() {
this.start = System.currentTimeMillis();
final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
@ -179,8 +180,8 @@ public class MediawikiImporter extends Thread implements Importer {
boolean page = false, text = false;
String title = null;
final BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
final ExecutorService service = Executors.newCachedThreadPool(
new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".convertConsumer"));
final convertConsumer[] consumers = new convertConsumer[threads];
final Future<?>[] consumerResults = (Future<?>[]) Array.newInstance(Future.class, threads);
for (int i = 0; i < threads; i++) {
@ -276,23 +277,23 @@ public class MediawikiImporter extends Thread implements Importer {
consumerResults[i].get(10000, TimeUnit.MILLISECONDS);
}
} catch (final Exception e) {
this.errorMessage = e.getMessage();
ConcurrentLog.logException(e);
} finally {
out.put(poison); // output thread condition (for file.close)
writerResult.get(10000, TimeUnit.MILLISECONDS);
}
} catch (final Exception e) {
this.errorMessage = e.getMessage();
ConcurrentLog.logException(e);
} finally {
if(reader != null) {
try {
reader.close();
} catch (IOException e) {
ConcurrentLog.warn("WIKITRANSLATION", "Could not close dump reader : " + e.getMessage());
}
}
try {
out.put(poison); // out keeps output file open until poisened, to close file if exception happend in this block
} catch (InterruptedException ex) { }
@ -310,7 +311,7 @@ public class MediawikiImporter extends Thread implements Importer {
File mediawikixml;
public indexMaker(final File mediawikixml) {
super("MediawikiImporter.indexMaker " + mediawikixml != null ? mediawikixml.getName() : "");
super("MediawikiImporter.indexMaker " + mediawikixml != null ? mediawikixml.getName() : "");
this.mediawikixml = mediawikixml;
}
@ -337,8 +338,8 @@ public class MediawikiImporter extends Thread implements Importer {
final PositionAwareReader in = new PositionAwareReader(dumpFile);
final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
final wikiConsumer consumer = new wikiConsumer(100, producer);
final ExecutorService service = Executors.newCachedThreadPool(
new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".createIndex"));
final Future<Integer> producerResult = service.submit(consumer);
final Future<Integer> consumerResult = service.submit(producer);
service.shutdown();
@ -535,14 +536,14 @@ public class MediawikiImporter extends Thread implements Importer {
}
public void genDocument() throws Parser.Failure {
try {
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);
} catch (final MalformedURLException e1) {
ConcurrentLog.logException(e1);
}
}
public void writeXML(final OutputStreamWriter os) throws IOException {
this.document.writeXML(os);
@ -676,9 +677,9 @@ public class MediawikiImporter extends Thread implements Importer {
} catch (final Parser.Failure e) {
ConcurrentLog.logException(e);
} catch (final IOException e) {
// TODO Auto-generated catch block
ConcurrentLog.logException(e);
}
}
}
} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
@ -772,78 +773,78 @@ public class MediawikiImporter extends Thread implements Importer {
}
public static void main(final String[] s) {
if (s.length == 0) {
System.out.println("usage:");
System.out.println(" -index <wikipedia-dump>");
System.out.println(" -read <start> <len> <idx-file>");
System.out.println(" -find <title> <wikipedia-dump>");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>");
ConcurrentLog.shutdown();
return;
}
try {
// example:
// java -Xmx2000m -cp classes:lib/bzip2.jar
// de.anomic.tools.mediawikiIndex -convert
// DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2
// DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert")) {
if(s.length < 3) {
System.out.println("usage:");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>");
ConcurrentLog.shutdown();
return;
}
final File targetdir = new File(s[2]);
try {
final MediawikiImporter mi = new MediawikiImporter(new MultiProtocolURL(s[1]), targetdir);
mi.start();
mi.join();
} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
} catch (MalformedURLException e) {
ConcurrentLog.logException(e);
}
}
if (s[0].equals("-index")) {
try {
createIndex(new File(s[1]));
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
}
if (s[0].equals("-read")) {
final long start = Integer.parseInt(s[1]);
final int len = Integer.parseInt(s[2]);
System.out.println(UTF8.String(read(new File(s[3]), start, len)));
}
if (s[0].equals("-find")) {
try {
final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
if (w == null) {
ConcurrentLog.info("WIKITRANSLATION", "not found");
} else {
System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start))));
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
}
} finally {
try {
HTTPClient.closeConnectionManager();
} catch (InterruptedException e) {
e.printStackTrace();
}
ConcurrentLog.shutdown();
}
}
}
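The parseSource call at the top of this hunk illustrates the migration pattern applied throughout this commit: the former set-of-ignored-class-names argument is replaced by the pair TagValency.EVAL plus a (here empty) set of valency-switch names. A minimal sketch of that call shape; pageUrl and html are placeholders standing in for the importer's fields:

    // Sketch only: parameter order mirrors the parseSource call above:
    // url, mime, charset, default tag valency, valency-switch tag names,
    // vocabulary scraper, timezone offset, depth, content bytes.
    final Document[] parsed = TextParser.parseSource(
            pageUrl,                    // placeholder for the importer's url field
            "text/html",
            StandardCharsets.UTF_8.name(),
            TagValency.EVAL,            // evaluate all tags by default
            new HashSet<String>(),      // no tag/class switches the valency
            new VocabularyScraper(),
            0,                          // timezone offset
            1,                          // crawl depth
            UTF8.getBytes(html));       // placeholder for the raw page html
    final Document doc = Document.mergeDocuments(pageUrl, "text/html", parsed);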

@ -37,6 +37,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
/**
* Base class for parsing compressed files relying on Apache commons-compress
@ -44,25 +45,25 @@ import net.yacy.document.VocabularyScraper;
*/
public abstract class AbstractCompressorParser extends AbstractParser implements Parser {
/** Crawl depth applied when parsing internal compressed content */
protected static final int DEFAULT_DEPTH = 999;
/**
* @param name the human readable name of the parser
*/
public AbstractCompressorParser(final String name) {
super(name);
}
/**
* @param source an open input stream on a compressed source
* @return a subclass of CompressorInputStream capable of uncompressing the source
*         on the fly
* @throws IOException if an error occurs while opening the compressed stream
*/
protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException;
/**
* Maps the given name of a compressed file to the name that the
* file should have after uncompression. For example, for "file.txt.xz", "file.txt" is returned.
@ -72,116 +73,137 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
*/
protected abstract String getUncompressedFilename(final String filename);
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
Long.MAX_VALUE);
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
Document maindoc;
final CompressorInputStream compressedInStream;
try {
compressedInStream = createDecompressStream(source);
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
try {
// create maindoc for this archive, register with supplied url & mime
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if (docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Parser.Failure e) {
throw e;
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
return new Document[] { maindoc };
}
/**
* Create the main parsed document for the compressed document at the given URL
* and Media type
*
* @param location the parsed resource URL
* @param mimeType the media type of the resource
* @param charset the charset name if known
* @param parser an instance of CompressorParser that is registered as the
* parser origin of the document
* @return a Document instance
*/
protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final AbstractCompressorParser parser) {
final String filename = location.getFileName();
return new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
}
/**
* Parse content in an open stream uncompressing on the fly a compressed
* resource.
*
* @param location the URL of the compressed resource
* @param charset the charset name if known
* @param ignoreClassNames an optional set of CSS class names whose matching
*                          html elements' content should be ignored
* @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the
* compressed content
* @param maxLinks the maximum total number of links to parse and add
* to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty
* or null text.
* @throws Parser.Failure when the parser processing failed
*/
protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
final String compressedFileName = location.getFileName();
final String contentfilename = getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/*
* Use the uncompressed file name for sub parsers to not unnecessarily use again
* this same uncompressing parser
*/
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+ contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
location.getPort(), contentPath);
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parse(
final DigestURL location,
final String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
Long.MAX_VALUE);
}
@Override
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws Parser.Failure {
Document maindoc;
final CompressorInputStream compressedInStream;
try {
compressedInStream = createDecompressStream(source);
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
try {
// create maindoc for this archive, register with supplied url & mime
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
final Document[] docs = this.parseCompressedInputStream(location, null, defaultValency, valencySwitchTagNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if (docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Parser.Failure e) {
throw e;
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
return new Document[] { maindoc };
}
/**
* Create the main parsed document for the compressed document at the given URL
* and Media type
*
* @param location the parsed resource URL
* @param mimeType the media type of the resource
* @param charset the charset name if known
* @param parser an instance of CompressorParser that is registered as the
* parser origin of the document
* @return a Document instance
*/
protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final AbstractCompressorParser parser) {
final String filename = location.getFileName();
return new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
}
/**
* Parse content in an open stream uncompressing on the fly a compressed
* resource.
*
* @param location the URL of the compressed resource
* @param charset the charset name if known
* @param defaultValency the default valency applied to html tag content
* @param valencySwitchTagNames an optional set of tag or class names whose matching
*                          html elements switch from the default valency
* @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the
* compressed content
* @param maxLinks the maximum total number of links to parse and add
* to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty
* or null text.
* @throws Parser.Failure when the parser processing failed
*/
protected Document[] parseCompressedInputStream(
final DigestURL location,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream,
final int maxLinks,
final long maxBytes) throws Failure {
final String compressedFileName = location.getFileName();
final String contentfilename = getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/*
* Use the uncompressed file name for sub parsers to not unnecessarily use again
* this same uncompressing parser
*/
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+ contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
location.getPort(), contentPath);
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
return TextParser.parseWithLimits(
contentLocation, mime, charset, defaultValency, valencySwitchTagNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
}
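To show how this abstract base is meant to be extended, a hedged sketch of a concrete subclass; the xz variant below is hypothetical and used only for illustration, it is not part of this diff. Only the decompressor factory and the filename mapping are supplied, while the TagValency-aware parse and parseWithLimits logic above is inherited unchanged:

    import java.io.IOException;
    import java.io.InputStream;
    import org.apache.commons.compress.compressors.CompressorInputStream;
    import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
    import org.apache.commons.compress.compressors.xz.XZUtils;

    // Hypothetical subclass, shown only to illustrate the two extension points.
    public class XZExampleParser extends AbstractCompressorParser {

        public XZExampleParser() {
            super("XZ Compressed Archive Parser (example)");
            this.SUPPORTED_EXTENSIONS.add("xz"); // SUPPORTED_EXTENSIONS as used by bzipParser below
        }

        @Override
        protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
            return new XZCompressorInputStream(source); // commons-compress decompressor
        }

        @Override
        protected String getUncompressedFilename(final String filename) {
            // maps "file.txt.xz" to "file.txt", analogous to GzipUtils/BZip2Utils below
            return XZUtils.getUncompressedFilename(filename);
        }
    }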

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
/**
@ -52,7 +53,7 @@ import net.yacy.kelondro.util.FileUtils;
* Unzips and parses the content and adds it to the created main document
*/
public class bzipParser extends AbstractParser implements Parser {
public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser");
this.SUPPORTED_EXTENSIONS.add("bz2");
@ -70,7 +71,8 @@ public class bzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -99,25 +101,25 @@ public class bzipParser extends AbstractParser implements Parser {
out = null;
} catch(Exception e) {
if (tempFile != null) {
FileUtils.deletedelete(tempFile);
}
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
} finally {
if(zippedContent != null) {
try {
zippedContent.close();
} catch (IOException ignored) {
log.warn("Could not close bzip input stream");
}
}
if(out != null) {
try {
out.close();
} catch (IOException e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
}
}
}
try {
// create maindoc for this bzip container, register with supplied url & mime
@ -125,7 +127,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
final Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tempFile);
final Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -140,7 +142,7 @@ public class bzipParser extends AbstractParser implements Parser {
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
/**
@ -151,9 +153,9 @@ public class bzipParser extends AbstractParser implements Parser {
* @param parser instance of bzipParser that is registered as the parser origin of the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) {
final String filename = location.getFileName();
Document maindoc = new Document(
location,
mimeType,
charset,
@ -172,49 +174,48 @@ public class bzipParser extends AbstractParser implements Parser {
null,
false,
new Date());
return maindoc;
}
/**
* Parse content in an open stream uncompressing on the fly a bzipped resource.
* @param location the URL of the bzipped resource
* @param charset the charset name if known
* @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the compressed content
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
*/
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
// creating a new parser class to parse the unzipped content
final String compressedFileName = location.getFileName();
final String contentfilename = BZip2Utils.getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/* Use the uncompressed file name for sub parsers so that this same bzip parser is not unnecessarily applied again */
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
} catch (MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
}
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws Parser.Failure {
Document maindoc = null;
BZip2CompressorInputStream zippedContent = null;
try {
@ -222,23 +223,23 @@ public class bzipParser extends AbstractParser implements Parser {
zippedContent = new BZip2CompressorInputStream(source);
} catch(Exception e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
}
try {
// create maindoc for this bzip container, register with supplied url & mime
maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content
final Document[] docs = parseCompressedInputStream(location, null, timezoneOffset, 999, zippedContent, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if(docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Exception e) {
if (e instanceof Parser.Failure) {
throw (Parser.Failure) e;
}
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);
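For context, a hedged sketch of how a caller can use the partial-parsing contract that bzipParser (and the other compressed-content parsers in this diff) advertise via isParseWithLimitsSupported(); the limit values and the names location/in are illustrative only:

    // Sketch only: "location" is a DigestURL of a .bz2 resource, "in" an open InputStream on it.
    final bzipParser parser = new bzipParser();
    try {
        final Document[] docs = parser.parseWithLimits(
                location, "application/x-bzip2", null,
                new VocabularyScraper(), 0, in,
                1000,               // maxLinks (example value)
                10L * 1024 * 1024); // maxBytes (example value)
        if (docs != null && docs.length > 0 && docs[0].isPartiallyParsed()) {
            // one of the limits was reached; the result is usable but incomplete
        }
    } catch (final Parser.Failure e) {
        // handle the parse failure for this location
    }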

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
/**
@ -52,8 +53,8 @@ import net.yacy.kelondro.util.FileUtils;
* Unzips and parses the content and adds it to the created main document
*/
public class gzipParser extends AbstractParser implements Parser {
private static final int DEFAULT_DEPTH = 999;
public gzipParser() {
super("GNU Zip Compressed Archive Parser");
@ -72,7 +73,8 @@ public class gzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
@ -84,10 +86,10 @@ public class gzipParser extends AbstractParser implements Parser {
try {
zippedContent = new GZIPInputStream(source);
} catch(IOException e) {
/* Use a GZIPOpeningStreamException to signal the caller that the error occurred directly on stream opening,
 * so that special error handling can be applied if needed */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
        new GZIPOpeningStreamException());
}
try {
int read = 0;
@ -103,32 +105,32 @@ public class gzipParser extends AbstractParser implements Parser {
out.write(data, 0, read);
}
} catch(Exception e) {
if (tempFile != null) {
FileUtils.deletedelete(tempFile);
}
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
} finally {
if(zippedContent != null) {
try {
zippedContent.close();
} catch (IOException ignored) {
log.warn("Could not close gzip input stream");
}
}
if(out != null) {
try {
out.close();
} catch (IOException e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
}
}
}
try {
maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -149,96 +151,96 @@ public class gzipParser extends AbstractParser implements Parser {
* @param parser an instance of gzipParser that is registered as the parser origin of the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
final String filename = location.getFileName();
Document maindoc = new Document(
location,
mimeType,
charset,
parser,
null,
null,
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null,
null,
null,
null,
0.0d, 0.0d,
(Object) null,
null,
null,
null,
false,
new Date());
return maindoc;
}
/**
* Parse content in an open stream uncompressing on the fly a gzipped resource.
* @param location the URL of the gzipped resource
* @param charset the charset name if known
* @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the compressed content
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
*/
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
// creating a new parser class to parse the unzipped content
final String compressedFileName = location.getFileName();
final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
} catch (MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws Parser.Failure {
Document maindoc = null;
GZIPInputStream zippedContent = null;
try {
/* Only use an in-memory stream here (no temporary file): the parsers
 * matching compressed content are expected to handle the maxBytes limit properly and terminate
 * before a possible OutOfMemoryError occurs */
zippedContent = new GZIPInputStream(source);
} catch(IOException e) {
/* Use a GZIPOpeningStreamException to signal the caller that the error occurred directly on stream opening,
 * so that special error handling can be applied if needed */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
        new GZIPOpeningStreamException());
}
try {
maindoc = createMainDocument(location, mimeType, charset, this);
Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if(docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Exception e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
@ -251,15 +253,15 @@ public class gzipParser extends AbstractParser implements Parser {
*/
public class GZIPOpeningStreamException extends Exception {
/** The serialization ID */
private static final long serialVersionUID = 2824038185373304636L;
public GZIPOpeningStreamException() {
super();
}
public GZIPOpeningStreamException(final String message) {
super(message);
}
}
}
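The comments above describe GZIPOpeningStreamException as a marker attached to the Parser.Failure when the stream cannot even be opened as gzip. A hedged sketch of the special handling a caller might apply, assuming the third Failure constructor argument is exposed as the exception cause; gzip, location and sourceStream are placeholders, and the enclosing method is assumed to declare throws InterruptedException and Parser.Failure:

    // Sketch only: distinguish "not gzip at all" from a failure inside the unpacked content.
    Document[] docs = null;
    try {
        docs = gzip.parse(location, "application/gzip", null,
                TagValency.EVAL, new HashSet<String>(),
                new VocabularyScraper(), 0, sourceStream);
    } catch (final Parser.Failure e) {
        if (e.getCause() instanceof gzipParser.GZIPOpeningStreamException) {
            // the content was not gzip-encoded at all: try other candidate parsers instead
        } else {
            throw e;
        }
    }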

@ -63,7 +63,7 @@ import net.yacy.document.parser.html.TransformerWriter;
public class htmlParser extends AbstractParser implements Parser {
/** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */
private static final int DEFAULT_MAX_LINKS = 10000;
public htmlParser() {
@ -108,42 +108,93 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, new HashSet<String>(), vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(
location,
mimeType,
documentCharset,
TagValency.EVAL,
new HashSet<String>(),
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
}
@Override
public Document[] parse(
final DigestURL location,
final String mimeType,
final String documentCharset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(
location, mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Failure {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Failure {
return parseWithLimits(
location,
mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
maxLinks,
maxLinks,
maxBytes);
}
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
throws Failure {
private Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxAnchors,
final int maxLinks,
final long maxBytes)
throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -152,10 +203,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes the anchor part from the original url, getRef() is never successful - consider other handling such as removeRef() in the crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} else { // a head tag fragment is only allowed on a url without an anchor hash-fragment, but there are discussions that the existence of a hash-fragment anchor takes preference (meaning: allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@ -221,7 +272,16 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final int timezoneOffset,
final String input,
final int maxAnchors,
final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -231,7 +291,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static method there is no need to init the local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, ignore_class_name, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -256,7 +316,8 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
@ -264,7 +325,7 @@ public class htmlParser extends AbstractParser implements Parser {
final int maxAnchors,
final int maxLinks,
final long maxBytes) throws Parser.Failure, IOException {
// make a scraper
String charset = null;
@ -280,8 +341,8 @@ public class htmlParser extends AbstractParser implements Parser {
htmlFilter = new ScraperInputStream(
sourceStream,
documentCharset,
ignore_class_name,
TagValency.EVAL,
valencySwitchTagNames,
defaultValency,
vocabularyScraper,
location,
false,
@ -325,26 +386,26 @@ public class htmlParser extends AbstractParser implements Parser {
location,
maxAnchors,
maxLinks,
ignore_class_name,
valencySwitchTagNames,
TagValency.EVAL,
vocabularyScraper,
timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);
final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars);
if(copiedChars > maxChars) {
/* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */
scraper.setContentSizeLimitExceeded(true);
} else if(copiedChars == maxChars) {
/* Exactly maxChars limit reached : let's check if more to read remain. */
if(sourceReader.read() >= 0) {
scraper.setContentSizeLimitExceeded(true);
}
}
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
writer.flush();
//sourceStream.close(); keep open for multiple parsing (close is done by the caller)
@ -456,9 +517,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
*/
private Document parseAlternativeSnapshot(
final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
final DigestURL location, final String mimeType, final String documentCharset,
final TagValency defaultValency, final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
// construct url for case (1) with anchor
@ -476,17 +538,17 @@ public class htmlParser extends AbstractParser implements Parser {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {
try {
snapshotStream.close();
} catch(IOException e) {
AbstractParser.log.warn("Could not close snapshot stream : " + e.getMessage());
}
}
}
AbstractParser.log.info("parse snapshot "+locationSnapshot.toString() + " additional to " + location.toString());
} catch (IOException | Failure ex) { }
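A hedged usage sketch of the extended parseToScraper entry point shown above; the HTML string and the class name in the switch set are made up, the exact effect of the switch set depends on the TagValency semantics introduced by this commit, and the enclosing method is assumed to declare throws IOException and MalformedURLException:

    // Sketch only: scrape a small HTML string, evaluating all tags by default and
    // switching the valency for elements carrying the (made-up) class "sidebar".
    final Set<String> valencySwitchTagNames = new HashSet<String>();
    valencySwitchTagNames.add("sidebar");
    final ContentScraper scraper = htmlParser.parseToScraper(
            new DigestURL("http://example.org/index.html"),
            StandardCharsets.UTF_8.name(),
            TagValency.EVAL,
            valencySwitchTagNames,
            new VocabularyScraper(),
            0,    // timezone offset
            "<html><body><div class=\"sidebar\">aside</div><p>main text</p></body></html>",
            100,  // maxAnchors
            100); // maxLinks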

@ -44,6 +44,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
@ -63,7 +64,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final IInStream source) throws Parser.Failure, InterruptedException {
@ -94,7 +96,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
}
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), ignore_class_name, timezoneOffset);
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), defaultValency, valencySwitchTagNames, timezoneOffset);
AbstractParser.log.fine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
@ -116,10 +118,11 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, ignore_class_name, timezoneOffset, new ByteArrayIInStream(source));
return parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, new ByteArrayIInStream(source));
}
@Override
@ -127,14 +130,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
return new Document[]{parse(location, mimeType, charset, ignore_class_name, timezoneOffset, cfos.toByteArray())};
return new Document[]{parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, cfos.toByteArray())};
} catch (final IOException e) {
throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
}
@ -148,7 +152,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
private Set<String> ignore_class_name;
private final TagValency defaultValency;
private Set<String> valencySwitchTagNames;
private final int timezoneOffset;
public SZParserExtractCallback(
@ -156,13 +161,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final IInArchive handler,
final Document doc,
final String prefix,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset) {
super.Init(handler);
this.log = logger;
this.doc = doc;
this.prefix = prefix;
this.ignore_class_name = ignore_class_name;
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames;
this.timezoneOffset = timezoneOffset;
}
@ -205,7 +212,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.ignore_class_name, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, this.defaultValency, this.valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}
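For completeness, a hedged sketch of invoking the 7zip parser through its byte-array entry point with the new valency pair; archiveLocation and archiveBytes are placeholders, and the call is assumed to run in a context that handles Parser.Failure and InterruptedException:

    // Sketch only: parse a whole .7z archive held in memory.
    final sevenzipParser sevenzip = new sevenzipParser();
    final Document doc = sevenzip.parse(
            archiveLocation,               // DigestURL of the .7z resource
            "application/x-7z-compressed",
            null,                          // charset unknown
            TagValency.EVAL,               // default valency for extracted html entries
            new HashSet<String>(),         // no valency switches
            0,                             // timezone offset
            archiveBytes);                 // byte[] with the raw archive content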

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
// this is a new implementation of this parser idiom using multiple documents as result set
@ -70,7 +71,8 @@ public class tarParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException {
@ -104,17 +106,17 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
/*
* Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive.
* We use the tar file name as the parent sub path. Example : http://host/archive.tar/name.
* Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the
* extension of the URL is still ".tar", thus incorrectly making the tar parser appear
* as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) {
continue;
}
maindoc.addSubDocuments(subDocs);
} catch (final Parser.Failure e) {
@ -130,146 +132,146 @@ public class tarParser extends AbstractParser implements Parser {
return new Document[]{maindoc};
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks,
final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException {
final DigestURL parentTarURL = createParentTarURL(location);
final TarArchiveInputStream tis = new TarArchiveInputStream(source);
// create maindoc for this tar container
final Document maindoc = createMainDocument(location, mimeType, charset, this);
// loop through the elements in the tar file and parse every single file inside
TarArchiveEntry entry;
int totalProcessedLinks = 0;
while (true) {
try {
entry = tis.getNextTarEntry();
if (entry == null) {
break;
}
/*
* We are here sure at least one entry has still to be processed : let's check
* now the bytes limit as sub parsers applied on eventual previous entries may
* not support partial parsing and would have thrown a Parser.Failure instead of
* marking the document as partially parsed.
*/
if (tis.getBytesRead() >= maxBytes) {
maindoc.setPartiallyParsed(true);
break;
}
if (entry.isDirectory() || entry.getSize() <= 0) {
continue;
}
final String name = entry.getName();
final int idx = name.lastIndexOf('.');
final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : "");
try {
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
/*
* Create an appropriate sub location to prevent unwanted fallback to the
* tarparser on resources included in the archive. We use the tar file name as
* the parent sub path. Example : http://host/archive.tar/name. Indeed if we
* create a sub location with a '#' separator such as
* http://host/archive.tar#name, the extension of the URL is still ".tar", thus
* incorrectly making the tar parser appear as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999,
entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead());
/*
* If the parser(s) did not consume all bytes in the entry, these ones will be
* skipped by the next call to getNextTarEntry()
*/
if (subDocs == null) {
continue;
}
maindoc.addSubDocuments(subDocs);
for (Document subDoc : subDocs) {
if (subDoc.getAnchors() != null) {
totalProcessedLinks += subDoc.getAnchors().size();
}
}
/*
 * Check whether a limit has been exceeded (we always reach this point when
 * maxLinks has been exceeded, because that limit requires parser support for
 * partial parsing in order to be detected)
 */
if (subDocs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
break;
}
} catch (final Parser.Failure e) {
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
}
} catch (final IOException e) {
AbstractParser.log.warn("tar parser: " + e.getMessage());
break;
}
}
return new Document[] { maindoc };
}
/**
* Generate a parent URL to use for generating sub URLs on tar archive entries.
*
* @param tarURL
* the URL of the tar archive
* @return a URL ending with a "/", suitable as a base URL for archive entries
*/
private DigestURL createParentTarURL(final DigestURL tarURL) {
String locationStr = tarURL.toNormalform(false);
if (!locationStr.endsWith("/")) {
locationStr += "/";
}
DigestURL parentTarURL;
try {
parentTarURL = new DigestURL(locationStr);
} catch (MalformedURLException e1) {
/* This should not happen */
parentTarURL = tarURL;
}
return parentTarURL;
}
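/*
 * Illustrative example (not part of the original source): for the archive URL
 * "http://host/archive.tar" this method yields "http://host/archive.tar/", so an
 * entry named "docs/readme.html" resolves to
 * "http://host/archive.tar/docs/readme.html" and is dispatched by its ".html"
 * extension instead of falling back to the tar parser:
 *
 *   final DigestURL parent = createParentTarURL(new DigestURL("http://host/archive.tar"));
 *   final DigestURL sub = new DigestURL(parent, "docs/readme.html");
 */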
/**
* Create the main resulting parsed document for a tar container
*
* @param location
* the parsed resource URL
* @param mimeType
* the media type of the resource
* @param charset
* the charset name if known
* @param parser
* instance of tarParser that is registered as the parser origin of
* the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final tarParser parser) {
final String filename = location.getFileName();
final Document maindoc = new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
return maindoc;
}
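/*
 * Usage sketch (illustrative): parse() builds the container document first and
 * attaches the documents of the tar entries to it afterwards, e.g.
 *
 *   final Document maindoc = createMainDocument(location, mimeType, charset, this);
 *   // ... parse entries, then maindoc.addSubDocuments(subDocs);
 */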
public final static boolean isTar(File f) {
if (!f.exists() || f.length() < 0x105) return false;

@ -39,6 +39,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
@ -72,7 +73,8 @@ public class zipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -121,7 +123,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue;
maindoc.addSubDocuments(docs);
} catch (final Parser.Failure e) {

@ -626,6 +626,7 @@ public class Crawler_p {
cachePolicy,
collection,
agentName,
TagValency.EVAL,
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
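For orientation, a minimal sketch of the shape of the new net.yacy.document.parser.html.TagValency type referenced above; only the EVAL constant is visible in this change set, so the IGNORE counterpart and the comments are assumptions rather than part of the diff.
// Sketch only - assumed shape, inferred from how TagValency is used in this change set.
package net.yacy.document.parser.html;

public enum TagValency {
    EVAL,   // tag content is evaluated; the default the crawl profiles pass in this commit
    IGNORE  // assumed counterpart: tag content is skipped by the scraper
}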

@ -43,6 +43,7 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
@ -161,7 +162,7 @@ public class QuickCrawlLink_p {
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null, null,
TagValency.EVAL, null, null,
timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {

@ -709,7 +709,16 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.getContentType());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().ignoreDivClassName(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
documents = TextParser.parseSource(
url,
responseHeader.getContentType(),
responseHeader.getCharacterEncoding(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
timezoneOffset,
response.depth(),
response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());
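For reference, a hedged caller sketch of the widened TextParser.parseSource signature, modeled on the DocumentIndex.java hunk further down; url, timezoneOffset, length and sourceStream are placeholders, and the reading of valencySwitchTagNames as the set of names that receive the opposite of the default valency is inferred from the parameter names, not stated in this diff.
// Illustrative only - mirrors the overload used in DocumentIndex.java below.
final Set<String> valencySwitchTagNames = new HashSet<String>();
valencySwitchTagNames.add("navigation"); // placeholder tag/class name
final Document[] docs = TextParser.parseSource(
        url,                    // DigestURL of the resource (placeholder)
        null,                   // MIME type unknown: let the parser dispatcher decide
        null,                   // charset unknown
        TagValency.EVAL,        // default valency applied to all tags
        valencySwitchTagNames,  // names treated with the opposite valency (inferred)
        new VocabularyScraper(),
        timezoneOffset,         // placeholder int
        0,                      // crawl depth
        length,                 // content length in bytes (placeholder)
        sourceStream);          // InputStream with the content (placeholder)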

@ -2945,7 +2945,8 @@ public final class Switchboard extends serverSwitch {
documents = TextParser.genericParseSource(new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
@ -2963,7 +2964,8 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.WebgraphConfiguration;
@ -162,24 +163,24 @@ public class DocumentIndex extends Segment {
}
InputStream sourceStream = null;
try {
sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
documents = TextParser.parseSource(url, null, null, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
documents = TextParser.parseSource(url, null, null, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
} finally {
if(sourceStream != null) {
try {
sourceStream.close();
} catch(IOException e) {
ConcurrentLog.warn("DocumentIndex", "Could not close source stream : " + e.getMessage());
}
}
}
//Document document = Document.mergeDocuments(url, null, documents);
final SolrInputDocument[] rows = new SolrInputDocument[documents.length];
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0);
rows[c++] =
super.storeDocument(
