crawl profile adoption to new tag valency attribute

pull/554/head
Michael Christen 2 years ago
parent 5acd98f4da
commit 4304e07e6f

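The hunks below replace the crawl profile's single ignore_class_name set with two pieces of state: a default TagValency plus a set of class names that switch that default. The TagValency enum itself is not part of this diff; the following is only a minimal sketch of the shape it presumably has (EVAL is the only constant referenced in the hunks, the IGNORE counterpart is an assumption):

package net.yacy.document.parser.html;

/** Sketch only: assumed shape of the enum referenced throughout this commit. */
public enum TagValency {
    EVAL,   // tag content is evaluated by the scraper (the default passed in every hunk below)
    IGNORE  // assumed counterpart: tag content is skipped; profiles can switch to this for listed class names
}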
@ -51,6 +51,7 @@ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
@ -276,7 +277,6 @@ public final class CrawlSwitchboard {
return this.profilesActiveCrawlsCounter.get(ASCII.String(profileKey));
}
private void initActiveCrawlProfiles() {
final Switchboard sb = Switchboard.getSwitchboard();
@ -308,6 +308,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -341,6 +342,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -373,6 +375,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -405,6 +408,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -437,6 +441,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -469,6 +474,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -509,6 +515,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -541,6 +548,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -573,6 +581,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -605,6 +614,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -640,6 +650,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);

@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
@ -355,7 +356,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0);
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL, null, null, 0);
return profile;
}

@ -55,6 +55,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
@ -126,7 +127,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"),
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"),
DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"),
VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
@ -150,7 +152,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
/** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
@ -175,7 +176,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final Set<String> ignore_class_name;
private final TagValency defaultValency;
private final Set<String> valencySwitchTagNames;
private final VocabularyScraper scraper;
/**
@ -238,7 +240,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
@ -283,9 +286,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the ignore_class_name and scraper information into a JSON Array
this.ignore_class_name = ignore_class_name == null ? new HashSet<String>() : ignore_class_name;
String jsonString = new JSONArray(ignore_class_name).toString();
put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString);
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet<String>() : valencySwitchTagNames;
String jsonString = new JSONArray(valencySwitchTagNames).toString();
put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name());
put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
@ -305,9 +310,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key);
this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency);
String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key);
JSONArray a;
if(jsonString == null) {
if (jsonString == null) {
a = new JSONArray();
} else {
try {
@ -317,9 +324,9 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
a = new JSONArray();
}
}
this.ignore_class_name = new HashSet<String>();
this.valencySwitchTagNames = new HashSet<String>();
for (int i = 0; i < a.length(); i++) try {
this.ignore_class_name.add(a.getString(i));
this.valencySwitchTagNames.add(a.getString(i));
} catch (JSONException e) {}
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
if (jsonString == null || jsonString.length() == 0) {
@ -336,8 +343,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public Set<String> ignoreDivClassName() {
return this.ignore_class_name;
public TagValency defaultValency() {
return this.defaultValency;
}
public Set<String> valencySwitchTagNames() {
return this.valencySwitchTagNames;
}
public VocabularyScraper scraper() {
@ -716,8 +727,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.indexMediaTypeMustNotMatch;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).

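For reference, the two new profile attributes round-trip through the underlying profile map exactly as the constructor hunks above show: the default valency is stored by enum name, the switch names as a JSON array, and a missing attribute falls back to EVAL. A condensed sketch of that logic outside the real CrawlProfile class (the helper name is hypothetical; the JSON calls mirror those visible in the diff):

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.json.JSONArray;
import org.json.JSONException;

import net.yacy.document.parser.html.TagValency;

final class ValencyAttributes {
    static final String DEFAULT_VALENCY = "default_valency";
    static final String VALENCY_SWITCH_TAG_NAME = "valency_switch_tag_name";

    /** store: enum name plus a JSON array of class names, as in the patched constructor */
    static void write(final Map<String, String> profile, final TagValency defaultValency, final Set<String> switchNames) {
        profile.put(DEFAULT_VALENCY, defaultValency.name());
        profile.put(VALENCY_SWITCH_TAG_NAME, new JSONArray(switchNames == null ? new HashSet<String>() : switchNames).toString());
    }

    /** read back, falling back to EVAL when the attribute is absent (older stored profiles) */
    static TagValency readDefaultValency(final Map<String, String> profile) {
        final String v = profile.get(DEFAULT_VALENCY);
        return (v == null || v.isEmpty()) ? TagValency.EVAL : TagValency.valueOf(v);
    }

    static Set<String> readSwitchNames(final Map<String, String> profile) {
        final Set<String> names = new HashSet<>();
        final String json = profile.get(VALENCY_SWITCH_TAG_NAME);
        if (json == null) return names;
        try {
            final JSONArray a = new JSONArray(json);
            for (int i = 0; i < a.length(); i++) names.add(a.getString(i));
        } catch (final JSONException e) {
            // malformed entries are ignored, mirroring the behaviour in the diff
        }
        return names;
    }
}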
@ -48,6 +48,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
public class Response {
@ -873,7 +874,11 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet<String>(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
return TextParser.parseSource(
url(), this.responseHeader == null ? null : this.responseHeader.getContentType(),
this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(),
TagValency.EVAL, new HashSet<String>(),
new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {

@ -32,6 +32,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.TagValency;
public abstract class AbstractParser implements Parser {
@ -64,7 +65,7 @@ public abstract class AbstractParser implements Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source);
return parse(url, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source);
}
@Override
@ -72,7 +73,8 @@ public abstract class AbstractParser implements Parser {
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
@ -80,7 +82,6 @@ public abstract class AbstractParser implements Parser {
return parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
/*
* The following abstract implementations create a circular call which would cause an endless loop when called.
* They are both here because one of them must be overridden by the implementing class.
@ -96,7 +97,7 @@ public abstract class AbstractParser implements Parser {
final InputStream source,
final int maxLinks,
final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
return parseWithLimits(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
}
@Override
@ -104,7 +105,8 @@ public abstract class AbstractParser implements Parser {
DigestURL location,
String mimeType,
String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
@ -171,5 +173,4 @@ public abstract class AbstractParser implements Parser {
return false;
}
}

@ -28,6 +28,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.parser.html.TagValency;
public interface Parser {
@ -68,7 +69,8 @@ public interface Parser {
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
@ -113,15 +115,29 @@ public interface Parser {
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset,
public Document[] parseWithLimits(
DigestURL url,
String mimeType,
String charset,
VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**

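Callers that have no per-profile valency configuration simply pass TagValency.EVAL together with an empty switch set, as the Response, MediawikiImporter and DocumentIndex hunks do. A usage sketch of the widened TextParser.parseSource call (mime type, charset and depth values are illustrative only):

import java.nio.charset.StandardCharsets;
import java.util.HashSet;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;

final class ParseWithDefaultValency {
    static Document[] parse(final DigestURL url, final byte[] content) throws Exception {
        return TextParser.parseSource(
                url,
                "text/html",                   // mime type (illustrative)
                StandardCharsets.UTF_8.name(), // charset
                TagValency.EVAL,               // default valency: evaluate all tags
                new HashSet<String>(),         // no class names switch the default
                new VocabularyScraper(),       // no vocabulary scraping
                0,                             // timezoneOffset
                1,                             // crawl depth (illustrative)
                content);
    }
}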
@ -51,6 +51,7 @@ import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.linkScraperParser;
import net.yacy.document.parser.mmParser;
@ -184,7 +185,8 @@ public final class TextParser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -201,7 +203,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -218,7 +220,8 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -236,7 +239,7 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
@ -248,7 +251,8 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignoreClassNames,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -261,14 +265,15 @@ public final class TextParser {
final Set<Parser> idioms = new HashSet<>();
idioms.add(TextParser.genericIdiom);
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
}
private static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -330,7 +335,7 @@ public final class TextParser {
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try {
return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
return parseSource(location, mimeType, parser, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset,
nonCloseInputStream, maxLinks, maxBytes);
} catch (final Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
@ -382,7 +387,7 @@ public final class TextParser {
- but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
maxBytesToRead = (int)maxBytes + 1;
}
if(contentLength >= 0 && contentLength < maxBytesToRead) {
if (contentLength >= 0 && contentLength < maxBytesToRead) {
maxBytesToRead = (int)contentLength;
}
@ -392,16 +397,23 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
public static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE);
}
@ -424,10 +436,19 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
public static Document[] parseWithLimits(
final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream,
int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@ -449,10 +470,11 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
public static Document[] parseWithLimits(
final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@ -475,7 +497,8 @@ public final class TextParser {
final String mimeType,
final Parser parser,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream,
@ -491,11 +514,11 @@ public final class TextParser {
try {
final Document[] docs;
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser do not support partial parsing within limits : let's control it here*/
final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch(final Parser.Failure e) {
@ -524,7 +547,8 @@ public final class TextParser {
final String mimeType,
final Set<Parser> parsers,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -552,13 +576,13 @@ public final class TextParser {
}
try {
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else {
/* Partial parsing is not supported by this parser : check content length now */
if(sourceArray.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&

@ -68,6 +68,7 @@ import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.NamePrefixThreadFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -536,7 +537,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure {
try {
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);

@ -37,6 +37,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
/**
* Base class for parsing compressed files relying on Apache commons-compress
@ -73,8 +74,14 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
protected abstract String getUncompressedFilename(final String filename);
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
public Document[] parse(
final DigestURL location,
final String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
@ -82,9 +89,17 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws Parser.Failure {
Document maindoc;
final CompressorInputStream compressedInStream;
try {
@ -97,7 +112,7 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
// create maindoc for this archive, register with supplied url & mime
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
final Document[] docs = this.parseCompressedInputStream(location, null, defaultValency, valencySwitchTagNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
@ -151,9 +166,15 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
* or null text.
* @throws Parser.Failure when the parser processing failed
*/
protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
protected Document[] parseCompressedInputStream(
final DigestURL location,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream,
final int maxLinks,
final long maxBytes) throws Failure {
final String compressedFileName = location.getFileName();
final String contentfilename = getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
@ -172,7 +193,8 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
return TextParser.parseWithLimits(
contentLocation, mime, charset, defaultValency, valencySwitchTagNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
/**
@ -70,7 +71,8 @@ public class bzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -125,7 +127,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
final Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tempFile);
final Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -210,7 +212,6 @@ public class bzipParser extends AbstractParser implements Parser {
}
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
/**
@ -72,7 +73,8 @@ public class gzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
@ -128,7 +130,7 @@ public class gzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -108,7 +108,18 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, new HashSet<String>(), vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(
location,
mimeType,
documentCharset,
TagValency.EVAL,
new HashSet<String>(),
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
}
@Override
@ -116,12 +127,23 @@ public class htmlParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String documentCharset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(
location, mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
}
@Override
@ -130,20 +152,49 @@ public class htmlParser extends AbstractParser implements Parser {
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Failure {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
return parseWithLimits(
location,
mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
maxLinks,
maxLinks,
maxBytes);
}
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
private Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxAnchors,
final int maxLinks,
final long maxBytes)
throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -152,10 +203,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes anchor part from original url getRef() is never successful - considere other handling as removeRef() in crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@ -221,7 +272,16 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final int timezoneOffset,
final String input,
final int maxAnchors,
final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -231,7 +291,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static methode no need to init local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, ignore_class_name, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -256,7 +316,8 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
@ -280,8 +341,8 @@ public class htmlParser extends AbstractParser implements Parser {
htmlFilter = new ScraperInputStream(
sourceStream,
documentCharset,
ignore_class_name,
TagValency.EVAL,
valencySwitchTagNames,
defaultValency,
vocabularyScraper,
location,
false,
@ -325,7 +386,7 @@ public class htmlParser extends AbstractParser implements Parser {
location,
maxAnchors,
maxLinks,
ignore_class_name,
valencySwitchTagNames,
TagValency.EVAL,
vocabularyScraper,
timezoneOffset);
@ -457,7 +518,8 @@ public class htmlParser extends AbstractParser implements Parser {
*/
private Document parseAlternativeSnapshot(
final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final TagValency defaultValency, final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
@ -477,7 +539,7 @@ public class htmlParser extends AbstractParser implements Parser {
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {

@ -44,6 +44,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
@ -63,7 +64,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final IInStream source) throws Parser.Failure, InterruptedException {
@ -94,7 +96,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
}
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), ignore_class_name, timezoneOffset);
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), defaultValency, valencySwitchTagNames, timezoneOffset);
AbstractParser.log.fine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
@ -116,10 +118,11 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, ignore_class_name, timezoneOffset, new ByteArrayIInStream(source));
return parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, new ByteArrayIInStream(source));
}
@Override
@ -127,14 +130,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
return new Document[]{parse(location, mimeType, charset, ignore_class_name, timezoneOffset, cfos.toByteArray())};
return new Document[]{parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, cfos.toByteArray())};
} catch (final IOException e) {
throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
}
@ -148,7 +152,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
private Set<String> ignore_class_name;
private final TagValency defaultValency;
private Set<String> valencySwitchTagNames;
private final int timezoneOffset;
public SZParserExtractCallback(
@ -156,13 +161,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final IInArchive handler,
final Document doc,
final String prefix,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset) {
super.Init(handler);
this.log = logger;
this.doc = doc;
this.prefix = prefix;
this.ignore_class_name = ignore_class_name;
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames;
this.timezoneOffset = timezoneOffset;
}
@ -205,7 +212,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.ignore_class_name, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, this.defaultValency, this.valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
// this is a new implementation of this parser idiom using multiple documents as result set
@ -70,7 +71,8 @@ public class tarParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException {
@ -104,7 +106,7 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
/*
* Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive.
* We use the tar file name as the parent sub path. Example : http://host/archive.tar/name.
* Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the
@ -112,7 +114,7 @@ public class tarParser extends AbstractParser implements Parser {
* as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) {
continue;
}
@ -130,57 +132,57 @@ public class tarParser extends AbstractParser implements Parser {
return new Document[]{maindoc};
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks,
final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException {
final DigestURL parentTarURL = createParentTarURL(location);
final TarArchiveInputStream tis = new TarArchiveInputStream(source);
// create maindoc for this tar container
final Document maindoc = createMainDocument(location, mimeType, charset, this);
// loop through the elements in the tar file and parse every single file inside
TarArchiveEntry entry;
int totalProcessedLinks = 0;
while (true) {
try {
entry = tis.getNextTarEntry();
if (entry == null) {
break;
}
/*
* We are here sure at least one entry has still to be processed : let's check
* now the bytes limit as sub parsers applied on eventual previous entries may
* not support partial parsing and would have thrown a Parser.Failure instead of
* marking the document as partially parsed.
*/
if (tis.getBytesRead() >= maxBytes) {
maindoc.setPartiallyParsed(true);
break;
}
if (entry.isDirectory() || entry.getSize() <= 0) {
continue;
}
final String name = entry.getName();
final int idx = name.lastIndexOf('.');
final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : "");
try {
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
/*
* Create an appropriate sub location to prevent unwanted fallback to the
* tarparser on resources included in the archive. We use the tar file name as
* the parent sub path. Example : http://host/archive.tar/name. Indeed if we
@ -188,66 +190,66 @@ public class tarParser extends AbstractParser implements Parser {
* http://host/archive.tar#name, the extension of the URL is still ".tar", thus
* incorrectly making the tar parser as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999,
entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead());
/*
* If the parser(s) did not consume all bytes in the entry, these ones will be
* skipped by the next call to getNextTarEntry()
*/
if (subDocs == null) {
continue;
}
maindoc.addSubDocuments(subDocs);
for (Document subDoc : subDocs) {
if (subDoc.getAnchors() != null) {
totalProcessedLinks += subDoc.getAnchors().size();
}
}
/*
* Check if a limit has been exceeded (we are sure to pass here when maxLinks
* has been exceeded as this limit require parser support for partial parsing to
* be detected)
*/
if (subDocs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
break;
}
} catch (final Parser.Failure e) {
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
}
} catch (final IOException e) {
AbstractParser.log.warn("tar parser:" + e.getMessage());
break;
}
}
return new Document[] { maindoc };
}
/**
* Generate a parent URL to use for generating sub URLs on tar archive entries.
*
* @param tarURL
* the URL of the tar archive
* @return an URL ending with a "/" suitable as a base URL for archive entries
*/
private DigestURL createParentTarURL(final DigestURL tarURL) {
String locationStr = tarURL.toNormalform(false);
if (!locationStr.endsWith("/")) {
locationStr += "/";
}
DigestURL parentTarURL;
try {
parentTarURL = new DigestURL(locationStr);
} catch (MalformedURLException e1) {
/* This should not happen */
parentTarURL = tarURL;
}
return parentTarURL;
}
/**
* Create the main resulting parsed document for a tar container
*
* @param location
@ -261,15 +263,15 @@ public class tarParser extends AbstractParser implements Parser {
* the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final tarParser parser) {
final String filename = location.getFileName();
final Document maindoc = new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
return maindoc;
}
public final static boolean isTar(File f) {
if (!f.exists() || f.length() < 0x105) return false;

@ -39,6 +39,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
@ -72,7 +73,8 @@ public class zipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -121,7 +123,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue;
maindoc.addSubDocuments(docs);
} catch (final Parser.Failure e) {

@ -626,6 +626,7 @@ public class Crawler_p {
cachePolicy,
collection,
agentName,
TagValency.EVAL,
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);

@ -43,6 +43,7 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
@ -161,7 +162,7 @@ public class QuickCrawlLink_p {
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null, null,
TagValency.EVAL, null, null,
timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {

@ -709,7 +709,16 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.getContentType());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().ignoreDivClassName(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
documents = TextParser.parseSource(
url,
responseHeader.getContentType(),
responseHeader.getCharacterEncoding(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
timezoneOffset,
response.depth(),
response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());

@ -2945,7 +2945,8 @@ public final class Switchboard extends serverSwitch {
documents = TextParser.genericParseSource(new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
@ -2963,7 +2964,8 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.WebgraphConfiguration;
@ -163,7 +164,7 @@ public class DocumentIndex extends Segment {
InputStream sourceStream = null;
try {
sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
documents = TextParser.parseSource(url, null, null, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
documents = TextParser.parseSource(url, null, null, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
} finally {
