introduction of tag-to-indexing relation TagValency

Branch: pull/554/head
Author: Michael Peter Christen (2 years ago)
Parent: 95e02e5291
Commit: 5acd98f4da

@ -28,13 +28,13 @@
<fieldset id="queues" style="width:210px;float:left;">
<legend>Queues</legend>
<table border="0" class="watchCrawler">
<thead>
<tr class="TableHeader">
<th width="120">Queue<br/>&nbsp;</th>
<th width="60">Size<br/>&nbsp;</th>
<th width="30"><span class="glyphicon glyphicon-wrench"></span></th>
</tr>
</thead>
<tbody>
<tr class="TableCellLight">
<td align="left"><a href="IndexCreateQueues_p.html?stack=LOCAL">Local Crawler</a></td>
@ -89,13 +89,13 @@
<fieldset id="indexsize" style="width:240px;float:left;">
<legend>Index Size</legend>
<table border="0" class="watchCrawler">
<thead>
<tr class="TableHeader">
<th width="130">Database<br/>&nbsp;</th>
<th width="50">Entries<br/>&nbsp;</th>
<th width="40">Seg-<br/>ments</th>
</tr>
</thead>
<tbody>
<tr class="TableCellLight">
<td align="left">Documents<br/><a href="#[urlpublictextSolrURL]#">solr search api</a></td>
@ -124,12 +124,12 @@
<legend>Progress</legend>
<form action="Crawler_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8">
<table border="0" class="watchCrawler">
<thead>
<tr class="TableHeader">
<th width="160">Indicator<br/>&nbsp;</th>
<th width="300" colspan="4">Level<br/>&nbsp;</th>
</tr>
</thead>
<tbody>
<tr class="TableCellLight">
<td align="left">Speed / PPM<br/>(Pages Per Minute)</td>
@ -147,7 +147,7 @@
<td align="left">Crawler PPM</td>
<td align="left" width="60"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
<td align="left" width="260px" colspan="3">
<progress id="ppmbar" max="30000" value="0" style="width:94%;"/>
<progress id="ppmbar" max="30000" value="0" style="width:94%;"/>
</td>
</tr>
<tr class="TableCellLight">
@ -180,13 +180,13 @@
<script>
function setTableSize() {
var maxh = Math.max(document.getElementById("progress").children[1].clientHeight, document.getElementById("indexsize").children[1].clientHeight, document.getElementById("queues").children[1].clientHeight) + 42;
if(lastMaxh !== maxh) {
var lastMaxh = maxh;
document.getElementById("indexsize").style.height = maxh + "px";
document.getElementById("progress").style.height = maxh + "px";
document.getElementById("queues").style.height = maxh + "px";
}
}
window.setInterval("setTableSize()", 1000);
</script>
@ -219,23 +219,23 @@ window.setInterval("setTableSize()", 1000);
If you crawl any un-wanted pages, you can delete them <a href="IndexCreateQueues_p.html?stack=LOCAL">here</a>.<br />::
<!-- 9 -->
No embedded local Solr index is connected. This is required to use a Solr query filter.
You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.::
<!-- 10 -->
The Solr filter query syntax is not valid : <code>#[solrQuery]#</code>::
<!-- 11 -->
Could not parse the Solr filter query : <code>#[solrQuery]#</code>
#(/info)#
</p>
#(wontReceiptRemoteResults)#::
<div class="alert alert-warning">
<p>You asked for remote indexing, but remote crawl results won't be added to the local index as the remote crawler is currently disabled on this peer.</p>
<p>You can activate it in the <a href="RemoteCrawl_p.html">Remote Crawl Configuration</a> page.</p>
</div>
#(/wontReceiptRemoteResults)#
<!-- #(noEmbeddedSolr)#::<div class="alert alert-error">No embedded local Solr index is connected. This is required to use the Solr filter query.
You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.</div>
#(/noEmbeddedSolr)#
#(solrQuerySyntaxtError)#::<div class="alert alert-error">The Solr filter query syntax is not valid : #[solrQuery]#</div>
@ -252,32 +252,32 @@ window.setInterval("setTableSize()", 1000);
<tr><td>
<table border="0" summary="A list of crawl profiles and their current settings." id="crawlProfiles">
<colgroup>
<col width="16" />
<col width="140"/>
<col width="16" />
<col width="140"/>
</colgroup>
<thead>
<tr class="TableHeader">
<th><strong>Name</strong></th>
#(debug)#::<th id="headerDebug"><strong>Count</strong></th>#(/debug)#
<th><strong>Status</strong></th>
</tr>
<tr class="TableHeader">
<th><strong>Name</strong></th>
#(debug)#::<th id="headerDebug"><strong>Count</strong></th>#(/debug)#
<th><strong>Status</strong></th>
</tr>
</thead>
<tbody>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#" id="#[handle]#">
<td>#[name]#</td>
#(debug)#::<td>#[count]#</td>#(/debug)#
<td id="#[handle]#_status_cell">#(terminateButton)#::
<div id="#[handle]#_status" style="text-decoration:blink;float:left;">Running</div>
<form id="#[handle]#_terminate" style="float:left;" action="Crawler_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8">
<div>
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="terminate" value="Terminate" class="btn btn-danger btn-xs"/>
</div>
</form>
#(/terminateButton)#
</td>
</tr>
<tr class="TableCell#(dark)#Light::Dark#(/dark)#" id="#[handle]#">
<td>#[name]#</td>
#(debug)#::<td>#[count]#</td>#(/debug)#
<td id="#[handle]#_status_cell">#(terminateButton)#::
<div id="#[handle]#_status" style="text-decoration:blink;float:left;">Running</div>
<form id="#[handle]#_terminate" style="float:left;" action="Crawler_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8">
<div>
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="terminate" value="Terminate" class="btn btn-danger btn-xs"/>
</div>
</form>
#(/terminateButton)#
</td>
</tr>
#{/list}#
</tbody>
</table>

@ -306,33 +306,33 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
JSONArray a;
if(jsonString == null) {
a = new JSONArray();
} else {
try {
a = new JSONArray(new JSONTokener(jsonString));
} catch(final JSONException e) {
ConcurrentLog.logException(e);
a = new JSONArray();
}
}
this.ignore_class_name = new HashSet<String>();
for (int i = 0; i < a.length(); i++) try {
this.ignore_class_name.add(a.getString(i));
} catch (JSONException e) {}
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
if (jsonString == null || jsonString.length() == 0) {
this.scraper = new VocabularyScraper();
} else {
VocabularyScraper loadedScraper;
try {
loadedScraper = new VocabularyScraper(jsonString);
} catch(final JSONException e) {
ConcurrentLog.logException(e);
loadedScraper = new VocabularyScraper();
}
this.scraper = loadedScraper;
}
}
@ -485,22 +485,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.crawlerurlmustmatch;
}
/**
* Render the urlMustMatchPattern as a String of limited size, suffixing it with
* "..." when it is truncated. Used to prevent unnecessary growth of the logs,
* and to prevent exceeding the field size limit for
* CollectionSchema.failreason_s (32k) when the pattern is present in a fail doc
* added to the Solr index.
*
* @return the urlMustMatchPattern formatted as a String of limited size
*/
public String formattedUrlMustMatchPattern() {
String patternStr = urlMustMatchPattern().toString();
if(patternStr.length() > 1000) {
/* The pattern may be quite large when using the 'From Link-List of URL' crawl start point. */
patternStr = patternStr.substring(0, Math.min(patternStr.length(), 1000)) + "...";
}
return patternStr;
}
/**
@ -517,47 +517,47 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.crawlerurlmustnotmatch;
}
/**
* Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getCrawlerOriginUrlMustMatchPattern() {
if (this.crawlerOriginUrlMustMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key);
try {
this.crawlerOriginUrlMustMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.crawlerOriginUrlMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
}
}
return this.crawlerOriginUrlMustMatch;
}
/**
* Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getCrawlerOriginUrlMustNotMatchPattern() {
if (this.crawlerOriginUrlMustNotMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key);
try {
this.crawlerOriginUrlMustNotMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.crawlerOriginUrlMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
}
}
return this.crawlerOriginUrlMustNotMatch;
}
@ -672,47 +672,47 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.indexcontentmustnotmatch;
}
/**
* Get the Pattern on media type that documents must match in order to be indexed
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getIndexMediaTypeMustMatchPattern() {
if (this.indexMediaTypeMustMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key);
try {
this.indexMediaTypeMustMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.indexMediaTypeMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
}
}
return this.indexMediaTypeMustMatch;
}
/**
* Get the Pattern on media type that documents must not match in order to be indexed
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getIndexMediaTypeMustNotMatchPattern() {
if (this.indexMediaTypeMustNotMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key);
try {
this.indexMediaTypeMustNotMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.indexMediaTypeMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
}
}
return this.indexMediaTypeMustNotMatch;
}
@ -734,29 +734,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
/**
* @return true when URLs of unsupported resources (no parser available or denied format) should
* be indexed as links (with metadata only on URL and not on content).
*/
public boolean isIndexNonParseableUrls() {
final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* @return true when the crawler must always cross check the eventual URL file
* extension against the actual Media Type, even when file extension is
* unknown or unsupported. False when the crawler should not load URLs
* with an unknown or unsupported file extension.
*/
public boolean isCrawlerAlwaysCheckMediaType() {
final String r = get(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key);
if (r == null) {
return false;
}
return (r.equals(Boolean.TRUE.toString()));
}
public CacheStrategy cacheStrategy() {
final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
@ -952,7 +952,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
public void putProfileEntry(
final String CRAWL_PROFILE_PREFIX,
final serverObjects prop,
final boolean active,
final boolean dark,
@ -1022,22 +1022,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
public static void main(String[] args) {
// test to convert the key set from set to string and back
Set<String> a = new HashSet<>();
a.add("eins"); a.add("zwei"); a.add("drei");
JSONArray j = new JSONArray(a);
String s = j.toString();
System.out.println(s);
JSONTokener o = new JSONTokener(s);
try {
j = new JSONArray(o);
System.out.println(j);
Set<String> h = new HashSet<String>();
for (int i = 0; i < j.length(); i++) h.add(j.getString(i));
System.out.println(h);
} catch (JSONException e) {
e.printStackTrace();
}
}
}

@ -60,6 +60,7 @@ import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.data.BookmarksDB.Tag;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
+ import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.FileUtils;
@ -117,12 +118,12 @@ public class BookmarkHelper {
// --------------------------------------
public static int importFromBookmarks(final BookmarksDB db, final DigestURL baseURL, final String input, final String tag, final boolean importPublic){
// convert string to input stream
final ByteArrayInputStream byteIn = new ByteArrayInputStream(UTF8.getBytes(input));
final InputStreamReader reader = new InputStreamReader(byteIn, StandardCharsets.UTF_8);
// import stream
return importFromBookmarks(db, baseURL, reader, tag, importPublic);
}
private static int importFromBookmarks(final BookmarksDB db, final DigestURL baseURL, final InputStreamReader input, final String tag, final boolean importPublic){
@ -135,7 +136,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
//load the links
- final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0);
+ final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, false);
FileUtils.copy(input,writer);

@ -73,7 +73,7 @@ import net.yacy.kelondro.util.ISO639;
public class ContentScraper extends AbstractScraper implements Scraper {
private final static int MAX_TAGSIZE = 1024 * 1024;
public static final int MAX_DOCSIZE = 40 * 1024 * 1024;
private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "&#039;".toCharArray();
@ -88,9 +88,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private static final Pattern LB = Pattern.compile("\n");
public enum TagType {
/** Tag with no end tag (see https://www.w3.org/TR/html51/syntax.html#void-elements),
* optional end tag (see https://www.w3.org/TR/html51/syntax.html#optional-tags),
* or where processing directly only the start tag is desired. */
singleton,
/** Paired tag : has a start tag and an end tag (https://www.w3.org/TR/html51/syntax.html#normal-elements) */
pair;
@ -146,22 +146,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String name;
public Properties opts;
public CharBuffer content;
- /** Set to true when this tag should be ignored from scraping */
- private boolean ignore = false;
- public Tag(final String name) {
+ private TagValency tv;
+ public Tag(final String name, TagValency defaultValency) {
this.name = name;
+ this.tv = defaultValency;
this.opts = new Properties();
this.content = new CharBuffer(MAX_TAGSIZE);
}
- public Tag(final String name, final Properties opts) {
+ public Tag(final String name, TagValency defaultValency, final Properties opts) {
this.name = name;
+ this.tv = defaultValency;
this.opts = opts;
this.content = new CharBuffer(MAX_TAGSIZE);
}
- public Tag(final String name, final Properties opts, final CharBuffer content) {
+ public Tag(final String name, TagValency defaultValency, final Properties opts, final CharBuffer content) {
this.name = name;
+ this.tv = defaultValency;
this.opts = opts;
this.content = content;
}
@ -178,15 +178,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/** @return true when this tag should be ignored from scraping */
public boolean isIgnore() {
- return this.ignore;
- }
- /**
- * @param ignore true when this tag should be ignored from scraping
- */
- public void setIgnore(final boolean ignore) {
- this.ignore = ignore;
- }
+ return this.tv == TagValency.IGNORE;
+ }
+ public TagValency getValency() {
+ return this.tv;
+ }
+ public void setValency(final TagValency tv) {
+ this.tv = tv;
+ }
}
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
@ -205,10 +204,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final List<ImageEntry> images;
private final SizeLimitedSet<AnchorURL> script, frames, iframes;
/**
* URLs of linked data item types referenced from HTML content with standard
* annotations such as RDFa, microdata, microformats or JSON-LD
*/
private final SizeLimitedSet<DigestURL> linkedDataTypes;
private final SizeLimitedMap<String, String> metas;
@ -230,8 +229,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final VocabularyScraper vocabularyScraper;
- /** Set of CSS class names whose matching div elements content should be ignored */
- private final Set<String> ignoreDivClassNames;
+ /** Set of CSS class names whose matching div elements may switch from IGNORE to EVAL or vice versa */
+ private final Set<String> valencySwitchTagNames;
+ private final TagValency defaultValency;
private final int timezoneOffset;
private int breadcrumbs;
@ -261,19 +261,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param root the document root url
* @param maxAnchors the maximum number of URLs to process and store in the anchors property.
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
- * @param ignoreDivClassNames an eventual set of CSS class names whose matching div elements content should be ignored
+ * @param valencySwitchTagNames an eventual set of CSS class names whose matching div elements content should be ignored
+ * @param defaultValency the valency default; should be TagValency.EVAL by default
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
@SuppressWarnings("unchecked")
- public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+ public ContentScraper(
+ final DigestURL root,
+ final int maxAnchors,
+ final int maxLinks,
+ final Set<String> valencySwitchTagNames,
+ final TagValency defaultValency,
+ final VocabularyScraper vocabularyScraper,
+ int timezoneOffset) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.vocabularyScraper = vocabularyScraper;
- this.ignoreDivClassNames = ignoreDivClassNames;
+ this.valencySwitchTagNames = valencySwitchTagNames;
+ this.defaultValency = defaultValency;
this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -321,10 +330,20 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
- public ContentScraper(final DigestURL root, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
- this(root, Integer.MAX_VALUE, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
+ public ContentScraper(
+ final DigestURL root,
+ final int maxLinks,
+ final Set<String> valencySwitchTagNames,
+ final TagValency defaultValency,
+ final VocabularyScraper vocabularyScraper,
+ int timezoneOffset) {
+ this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
}
+ public TagValency defaultValency() {
+ return this.defaultValency;
+ }
@Override
public void finish() {
this.content.trimToSize();
@ -333,12 +352,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void scrapeText(final char[] newtext0, final Tag insideTag) {
if (insideTag != null) {
- if(insideTag.ignore) {
+ if (insideTag.tv == TagValency.IGNORE) {
return;
}
if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
return;
}
}
int p, pl, q, s = 0;
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
@ -414,21 +433,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
List<ContentScraperListener> anchorListeners = new ArrayList<>();
for (int i = 0; i < listeners.length; i += 2) {
if (listeners[i] == ContentScraperListener.class) {
anchorListeners.add((ContentScraperListener)listeners[i+1]);
}
}
if(!this.maxAnchorsExceeded) {
int maxLinksToDetect = this.maxAnchors - this.anchors.size();
if(maxLinksToDetect < Integer.MAX_VALUE) {
/* Add one to the anchors limit to detect when the limit is exceeded */
maxLinksToDetect++;
}
findAbsoluteURLs(b, this.anchors, anchorListeners, maxLinksToDetect);
if(this.anchors.size() > this.maxAnchors) {
this.maxAnchorsExceeded = true;
this.anchors.remove(this.anchors.size() -1);
}
}
// append string to content
@ -453,7 +472,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
public static long findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners, final long maxURLs) {
if(text == null) {
return 0;
}
int schemePosition, offset = 0;
boolean hasWhiteSpace;
@ -465,7 +484,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
long detectedURLsCount = 0;
while (offset < text.length() && detectedURLsCount < maxURLs) {
if(!urlSchemeMatcher.find(offset)) {
break;
}
schemePosition = urlSchemeMatcher.start();
@ -473,26 +492,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
urlString = text.substring(schemePosition, hasWhiteSpace ? whiteSpaceMatcher.start() : text.length());
if (urlString.endsWith(".")) {
urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
}
/* URLs can contain brackets, furthermore as they can even be reserved characters in the URI syntax (see https://tools.ietf.org/html/rfc3986#section-2.2)
* But when unpaired, in most cases this is that the unpaired bracket is not part of the URL, but rather used to wrap it in the text*/
urlString = removeUnpairedBrackets(urlString, '(', ')');
urlString = removeUnpairedBrackets(urlString, '{', '}');
urlString = removeUnpairedBrackets(urlString, '[', ']');
offset = schemePosition + urlString.length();
try {
url = new AnchorURL(urlString);
detectedURLsCount++;
if(urls != null) {
urls.add(url);
}
if(listeners != null) {
for(ContentScraperListener listener : listeners) {
listener.anchorAdded(url.toNormalform(false));
}
}
} catch (final MalformedURLException ignored) {}
}
return detectedURLsCount;
@ -505,62 +524,62 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param listeners a collection of listeners to trigger.
*/
public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
findAbsoluteURLs(text, urls, listeners, Long.MAX_VALUE);
}
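As a usage illustration (an editor's sketch, not part of this commit): the following self-contained snippet calls the public findAbsoluteURLs helper shown above. The package of AnchorURL is assumed to be net.yacy.cora.document.id, as elsewhere in the YaCy code base.

import java.util.ArrayList;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.parser.html.ContentScraper;

public class FindUrlsDemo {
    public static void main(String[] args) {
        final List<AnchorURL> urls = new ArrayList<>();
        // The unpaired trailing ')' only wraps the URL in the text and is stripped,
        // while the balanced '(b)' inside the URL is kept; the trailing '.' after
        // example.net is removed as sentence punctuation.
        ContentScraper.findAbsoluteURLs("see (http://example.com/a(b)c) and http://example.net.", urls, null);
        for (final AnchorURL url : urls) {
            System.out.println(url.toNormalform(false));
        }
    }
}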
/**
* Analyze bracket pairs found in the string and eventually
* return a truncated version of that string when one or more pairs are incomplete
*
* @param str
* the string to analyze
* @param openingMark
* the opening bracket character (example : '{')
* @param closingMark
* the closing bracket character (example : '}')
* @return the original string or a truncated copy
*/
protected static String removeUnpairedBrackets(final String str, final char openingMark,
final char closingMark) {
if(str == null) {
return null;
}
String result = str;
char ch;
int depth = 0, index = 0, lastUnpairedOpeningIndex = -1;
/* Loop on all characters of the string */
for(; index < str.length(); index++) {
ch = str.charAt(index);
if(ch == openingMark) {
if(depth == 0) {
lastUnpairedOpeningIndex = index;
}
depth++;
} else if(ch == closingMark) {
depth--;
if(depth == 0) {
lastUnpairedOpeningIndex = -1;
}
}
if(depth < 0) {
/* Unpaired closing mark : stop the loop here */
break;
}
}
if (depth > 0) {
/* One or more unpaired opening marks : truncate at the first opening level */
if(lastUnpairedOpeningIndex >= 0) {
result = str.substring(0, lastUnpairedOpeningIndex);
}
} else if (depth < 0) {
/* One or more unpaired closing marks : truncate at the current index as the loop should have been exited with a break */
if(index >= 0) {
result = str.substring(0, index);
}
}
return result;
}
/**
* @param relativePath relative path to this document base URL
@ -574,42 +593,42 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
/**
* Parse the eventual microdata itemtype attribute of a tag and extract its
* valid URL tokens when the itemscope attribute is present.
*
* @param tagAttributes parsed HTML tag attributes.
* @return a set of URLs eventually empty when no itemtype attribute is present
* or when its value is not valid
* @see <a href="https://www.w3.org/TR/microdata/#dfn-itemtype">itemtype
* definition at W3C</a>
* @see <a href=
* "https://html.spec.whatwg.org/multipage/microdata.html#attr-itemtype">itemtype
* definition at WHATWG</a>
*/
private Set<DigestURL> parseMicrodataItemType(final Properties tagAttributes) {
final Set<DigestURL> types = new HashSet<>();
if (tagAttributes != null) {
/*
* The itemtype attribute must not be specified on elements that do not have an
* itemscope attribute specified. So we lazily check here for itemscope boolean
* attribute presence (strictly conforming parsing would also check it has no
* value or the value is the empty string or "itemscope")
*/
if (tagAttributes.getProperty("itemscope") != null) {
final Set<String> itemTypes = parseSpaceSeparatedTokens(tagAttributes.getProperty("itemtype"));
for (final String itemType : itemTypes) {
try {
types.add(new DigestURL(itemType));
} catch (final MalformedURLException ignored) {
/* Each itemtype space-separated token must be a valid absolute URL */
}
}
}
}
return types;
}
private void checkOpts(final Tag tag) {
// vocabulary classes
@ -651,53 +670,53 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
/**
* Parses sizes icon link attribute. (see
* http://www.w3.org/TR/html5/links.html#attr-link-sizes) Eventual
* duplicates are removed.
*
* @param sizesAttr
* sizes attribute string, may be null
* @return a set of sizes eventually empty.
*/
public static Set<Dimension> parseSizes(String sizesAttr) {
Set<Dimension> sizes = new HashSet<Dimension>();
Set<String> tokens = parseSpaceSeparatedTokens(sizesAttr);
for (String token : tokens) {
/*
* "any" keyword may be present, but doesn't have to produce a
* dimension result
*/
if (token != null) {
Matcher matcher = IconEntry.SIZE_PATTERN.matcher(token);
if (matcher.matches()) {
/* With given pattern no NumberFormatException can occur */
sizes.add(new Dimension(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2))));
}
}
}
return sizes;
}
/**
* Parses a space separated tokens attribute value (see
* http://www.w3.org/TR/html5/infrastructure.html#space-separated-tokens).
* Eventual duplicates are removed.
*
* @param attr
* attribute string, may be null
* @return a set of tokens eventually empty
*/
public static Set<String> parseSpaceSeparatedTokens(final String attr) {
Set<String> tokens = new HashSet<>();
/* Check attr string is not empty to avoid adding a single empty string
* in result */
if (attr != null && !attr.trim().isEmpty()) {
String[] items = attr.trim().split(CommonPattern.SPACES.pattern());
Collections.addAll(tokens, items);
}
return tokens;
}
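A short usage sketch for the two parsers above (not part of the commit; Dimension is assumed to be java.awt.Dimension, which matches its width/height use here):

import java.awt.Dimension;
import java.util.Set;
import net.yacy.document.parser.html.ContentScraper;

public class ParseDemo {
    public static void main(String[] args) {
        // "any" is allowed but yields no dimension; the duplicate 16x16 collapses in the Set.
        final Set<Dimension> sizes = ContentScraper.parseSizes("16x16 32x32 any 16x16");
        System.out.println(sizes.size()); // 2
        // Space-separated tokens are trimmed and deduplicated as well.
        final Set<String> tokens = ContentScraper.parseSpaceSeparatedTokens(" icon  shortcut icon ");
        System.out.println(tokens); // [icon, shortcut] in some order
    }
}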
/**
* Retain only icon relations (standard and non standard) from tokens .
@ -705,13 +724,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @return a Set of icon relations, eventually empty
*/
public Set<String> retainIconRelations(Collection<String> relTokens) {
HashSet<String> iconRels = new HashSet<>();
for(String token : relTokens) {
if(IconLinkRelations.isIconRel(token)) {
iconRels.add(token.toLowerCase(Locale.ENGLISH));
}
}
return iconRels;
}
/**
@ -720,9 +739,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
@Override
public void scrapeTag0(final Tag tag) {
- if(tag.ignore) {
+ if (tag.tv == TagValency.IGNORE) {
return;
}
checkOpts(tag);
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
@ -740,23 +759,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} catch (final NumberFormatException e) {}
this.evaluationScores.match(Element.imgpath, src);
} else if(tag.name.equalsIgnoreCase("base")) {
final String baseHref = tag.opts.getProperty("href", EMPTY_STRING);
if(!baseHref.isEmpty()) {
/* We must use here AnchorURL.newAnchor as the base href may also be an URL relative to the document URL */
try {
this.root = AnchorURL.newAnchor(this.root, baseHref);
} catch (final MalformedURLException | RuntimeException ignored) {
/* Nothing more to do when the base URL is malformed */
}
}
} else if (tag.name.equalsIgnoreCase("frame")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
if(src != null) {
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
//this.addAnchor(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true));
}
} else if (tag.name.equalsIgnoreCase("body")) {
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
@ -786,9 +805,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
tag.opts.put("name", areatitle);
AnchorURL url = absolutePath(href);
if(url != null) {
tag.opts.put("href", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
tag.opts.put("href", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
}
}
} else if (tag.name.equalsIgnoreCase("link")) {
@ -808,18 +827,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
Set<String> iconRels = retainIconRelations(relTokens);
/* Distinguish icons from images. It will enable for example to later search only images and no icons */
if (!iconRels.isEmpty()) {
String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING);
Set<Dimension> sizes = parseSizes(sizesAttr);
IconEntry icon = this.icons.get(newLink);
/* There is already an icon with same URL for this document :
* they may have different rel attribute or different sizes (multi sizes ico file) or this may be a duplicate */
if(icon != null) {
icon.getRel().addAll(iconRels);
icon.getSizes().addAll(sizes);
} else {
icon = new IconEntry(newLink, iconRels, sizes);
this.icons.put(newLink, icon);
}
} else if (rel.equalsIgnoreCase("canonical")) {
tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.setAll(tag.opts);
@ -863,19 +882,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (name.equalsIgnoreCase("movie")) {
AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
if(url != null) {
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
}
}
} else if (tag.name.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
if(src != null) {
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
// this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
// this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
}
} else if (tag.name.equalsIgnoreCase("html")) {
final String lang = tag.opts.getProperty("lang", EMPTY_STRING);
@ -893,9 +912,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
@Override
public void scrapeTag1(final Tag tag) {
- if(tag.ignore) {
+ if (tag.tv == TagValency.IGNORE) {
return;
}
checkOpts(tag);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
@ -916,12 +935,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
final String h;
if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[0].add(h);
@ -971,10 +990,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (tag.name.equalsIgnoreCase("script")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) {
AnchorURL absoluteSrc = absolutePath(src);
if(absoluteSrc != null) {
this.script.add(absoluteSrc);
}
this.evaluationScores.match(Element.scriptpath, src);
} else {
this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" "));
@ -996,54 +1015,47 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
}
/**
* Scraping operation applied to any kind of tag opening, being either singleton
* or paired tag, not restricted to tags listed in
* {@link ContentScraper#linkTags0} and {@link ContentScraper#linkTags1}.
*/
@Override
public void scrapeAnyTagOpening(final Tag tag) {
- if (tag != null && !tag.ignore && tag.opts != null) {
+ if (tag != null && tag.tv == TagValency.EVAL && tag.opts != null) {
/*
* HTML microdata can be annotated on any kind of tag, so we don't restrict this
* scraping to the limited sets in linkTags0 and linkTags1
*/
this.linkedDataTypes.addAll(parseMicrodataItemType(tag.opts));
}
}
@Override
- public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
- boolean ignore = false;
- /* First, inherit ignore property from eventual parent */
- if(parentTag != null) {
- ignore = parentTag.ignore;
- }
- /* Parent is not marked as ignored : let's check the current tag */
- if (!ignore &&
- this.ignoreDivClassNames != null &&
- tag != null &&
- (TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
- final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
- final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
- ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
- }
- return ignore;
- }
+ public TagValency tagValency(final Tag tag, final Tag parentTag) {
+ if (parentTag != null && parentTag.tv != this.defaultValency) return parentTag.tv;
+ if (this.valencySwitchTagNames != null &&
+ tag != null &&
+ (TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
+ final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
+ final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
+ if (!Collections.disjoint(this.valencySwitchTagNames, classes)) return this.defaultValency.reverse();
+ }
+ return this.defaultValency;
+ }
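To make the new control flow concrete, here is a minimal, self-contained sketch (with hypothetical names such as SimpleTag; not part of the commit) of the valency rules implemented above: a child inherits any non-default valency from its parent, and a div or nav tag whose class attribute matches one of the switch names flips the default valency.

import java.util.Set;

public class TagValencyDemo {

    // Assumed shape of the new enum, inferred from its uses in this diff.
    enum TagValency {
        EVAL, IGNORE;
        TagValency reverse() { return this == EVAL ? IGNORE : EVAL; }
    }

    // Minimal stand-in for ContentScraper.Tag, for illustration only.
    static final class SimpleTag {
        final String name;
        final Set<String> classes;
        TagValency tv;
        SimpleTag(final String name, final Set<String> classes) {
            this.name = name;
            this.classes = classes;
        }
    }

    // Mirrors the decision flow of ContentScraper.tagValency(tag, parentTag).
    static TagValency tagValency(final SimpleTag tag, final SimpleTag parent,
            final Set<String> valencySwitchTagNames, final TagValency defaultValency) {
        // A parent with a non-default valency propagates it to its children.
        if (parent != null && parent.tv != defaultValency) return parent.tv;
        // Only div and nav tags may switch valency, based on their CSS classes.
        if (valencySwitchTagNames != null && tag != null
                && ("div".equals(tag.name) || "nav".equals(tag.name))) {
            for (final String cls : tag.classes) {
                if (valencySwitchTagNames.contains(cls)) return defaultValency.reverse();
            }
        }
        return defaultValency;
    }

    public static void main(String[] args) {
        final Set<String> switchNames = Set.of("sidebar");
        final SimpleTag nav = new SimpleTag("nav", Set.of("sidebar"));
        nav.tv = tagValency(nav, null, switchNames, TagValency.EVAL);    // switched to IGNORE
        final SimpleTag child = new SimpleTag("p", Set.of());
        child.tv = tagValency(child, nav, switchNames, TagValency.EVAL); // inherits IGNORE
        System.out.println(nav.tv + " / " + child.tv);
    }
}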
/**
* Add an anchor to the anchors list, and trigger any eventual listener
* @param anchor anchor to add. Must not be null.
*/
protected void addAnchor(AnchorURL anchor) {
if(this.anchors.size() >= this.maxAnchors) {
this.maxAnchorsExceeded = true;
} else {
this.anchors.add(anchor);
this.fireAddAnchor(anchor.toNormalform(false));
}
}
@ -1203,13 +1215,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.iframes;
}
/**
* @return URLs of linked data item types referenced from HTML content with standard
* annotations such as RDFa, microdata, microformats or JSON-LD
*/
public SizeLimitedSet<DigestURL> getLinkedDataTypes() {
return this.linkedDataTypes;
}
public Set<AnchorURL> getScript() {
return this.script;
@ -1258,32 +1270,32 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @return true when the limit on content size scraped has been exceeded
*/
public boolean isContentSizeLimitExceeded() {
return this.contentSizeLimitExceeded;
}
/**
* @param contentSizeLimitExceeded set to true when a limit on content size scraped has been exceeded
*/
public void setContentSizeLimitExceeded(final boolean contentSizeLimitExceeded) {
this.contentSizeLimitExceeded = contentSizeLimitExceeded;
}
/**
* @return true when the maxAnchors limit has been exceeded
*/
public boolean isMaxAnchorsExceeded() {
return this.maxAnchorsExceeded;
}
/**
* @return true when at least one limit on content size, anchors number or links number has been exceeded
*/
public boolean isLimitsExceeded() {
return this.contentSizeLimitExceeded || this.maxAnchorsExceeded || this.css.isLimitExceeded()
|| this.rss.isLimitExceeded() || this.embeds.isLimitExceeded() || this.metas.isLimitExceeded()
|| this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded()
|| this.frames.isLimitExceeded() || this.iframes.isLimitExceeded() || this.linkedDataTypes.isLimitExceeded();
}
/*
DC in html example:
@ -1544,11 +1556,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void registerHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
if(listener instanceof ContentScraperListener) {
this.htmlFilterEventListeners.add(ContentScraperListener.class, (ContentScraperListener)listener);
} else {
this.htmlFilterEventListeners.add(ScraperListener.class, listener);
}
}
}
@ -1559,11 +1571,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void deregisterHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
if(listener instanceof ContentScraperListener) {
this.htmlFilterEventListeners.remove(ContentScraperListener.class, (ContentScraperListener)listener);
} else {
this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
}
}
}
@ -1604,13 +1616,25 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
- final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset);
+ final ScraperInputStream htmlFilter = new ScraperInputStream(
+ new ByteArrayInputStream(page),
+ StandardCharsets.UTF_8.name(),
+ new HashSet<String>(), TagValency.EVAL,
+ new VocabularyScraper(),
+ new DigestURL("http://localhost"),
+ false, maxLinks, timezoneOffset);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(
new DigestURL("http://localhost"),
maxLinks,
new HashSet<String>(),
TagValency.EVAL,
new VocabularyScraper(),
timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();

@ -26,26 +26,26 @@ package net.yacy.document.parser.html;
public interface Scraper {
/**
* @param tag
* a tag name
* @return true when the tag name belongs to the first category of tags
* according to the Scraper implementation, and is therefore a candidate
* for processing by the
* {@link #scrapeTag0(net.yacy.document.parser.html.ContentScraper.Tag)}
* implementation
*/
public boolean isTag0(String tag);
/**
* @param tag
* a tag name
* @return true when the tag name belongs to the second category of tags
* according to the Scraper implementation, and is therefore a candidate
* for processing by the
* {@link #scrapeTag1(net.yacy.document.parser.html.ContentScraper.Tag)}
* implementation
*/
public boolean isTag1(String tag);
/**
@ -73,14 +73,16 @@ public interface Scraper {
*/
public void scrapeAnyTagOpening(ContentScraper.Tag tag);
/**
* @param tag
* a parsed tag
* @param parentTag the parent tag, if any
* @return the valency (EVAL or IGNORE) to apply to this tag according to
* the scraper implementation rules
*/
public TagValency tagValency(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
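/**
* @return the valency this scraper applies to tags that are not matched by
* any valency-switch rule
*/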
public TagValency defaultValency();
public void scrapeComment(final char[] comment);

@ -62,7 +62,8 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream(
final InputStream inStream,
final String inputStreamCharset,
final Set<String> valencySwitchTagNames,
final TagValency defaultValency,
final VocabularyScraper vocabularyScraper,
final DigestURL rooturl,
final boolean passbyIfBinarySuspect,
@ -73,7 +74,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
scraper.registerHtmlFilterEventListener(this);
try {

@ -0,0 +1,30 @@
/**
* TagValency
* Copyright 2023 by Michael Peter Christen, @0rb1t3r
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
public enum TagValency {
IGNORE, // do not index that tag
EVAL; // do index that tag
public TagValency reverse() {
return this == IGNORE ? EVAL : IGNORE;
}
}
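The enum drives the new tag-to-indexing relation: a scraper answers defaultValency() for ordinary tags and may flip that answer in tagValency(tag, parentTag) for tags named in the valencySwitchTagNames set. A minimal sketch of such a rule, assuming accessor names on ContentScraper.Tag (getValency, getProperty) that are not part of this diff:
// Sketch only: the field names follow the constructor parameters introduced above;
// the Tag accessors used here are assumptions, not part of this changeset.
@Override
public TagValency tagValency(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag) {
    // a switched parent propagates its valency to the whole subtree
    if (parentTag != null && parentTag.getValency() != this.defaultValency) {
        return parentTag.getValency();
    }
    // tags whose class attribute is listed in the switch set get the opposite of the default
    if (this.valencySwitchTagNames != null
            && this.valencySwitchTagNames.contains(tag.getProperty("class", ""))) {
        return this.defaultValency.reverse();
    }
    return this.defaultValency;
}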

@ -64,14 +64,14 @@ public final class TransformerWriter extends Writer {
private boolean inComment;
private boolean binaryUnsuspect;
private final boolean passbyIfBinarySuspect;
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
final Scraper scraper,
final boolean passbyIfBinarySuspect
) {
this(outStream, charSet, scraper, passbyIfBinarySuspect, 64);
}
public TransformerWriter(
@ -229,7 +229,7 @@ public final class TransformerWriter extends Writer {
// we are not collecting tag text -> cases (1) - (3)
// case (1): this is not a tag opener/closer
if (this.scraper != null && content.length > 0) {
this.scraper.scrapeText(content, null);
}
return content;
}
@ -283,24 +283,24 @@ public final class TransformerWriter extends Writer {
private char[] filterTagOpening(final String tagname, final char[] content) {
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
ContentScraper.Tag tag = new ContentScraper.Tag(tagname, this.scraper.defaultValency(), charBuffer.propParser());
charBuffer.close();
final ContentScraper.Tag parentTag;
if(this.tagStack.size() > 0) {
parentTag = this.tagStack.lastElement();
} else {
parentTag = null;
}
/* Apply the scraper's valency rules: the tag is marked EVAL or IGNORE */
if (this.scraper != null) {
tag.setValency(this.scraper.tagValency(tag, parentTag));
}
/* Apply processing relevant for any kind of tag opening */
if(this.scraper != null) {
this.scraper.scrapeAnyTagOpening(tag);
}
if (this.scraper != null && this.scraper.isTag0(tagname)) {

@ -57,6 +57,7 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter;
@ -276,7 +277,16 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
htmlFilter = new ScraperInputStream(
sourceStream,
documentCharset,
ignore_class_name,
TagValency.EVAL,
vocabularyScraper,
location,
false,
maxLinks,
timezoneOffset);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
} catch (final IOException e1) {
@ -311,7 +321,14 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content
// for this static method no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper(
location,
maxAnchors,
maxLinks,
ignore_class_name,
TagValency.EVAL,
vocabularyScraper,
timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
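// derive the character budget from the byte budget via the decoder's average chars-per-byte ratio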
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());

@ -66,6 +66,7 @@ import net.yacy.data.WorkTables;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
@ -733,8 +734,13 @@ public class Crawler_p {
} else {
/* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(
new DigestURL(crawlingFile),
10000000,
new HashSet<String>(),
TagValency.EVAL,
new VocabularyScraper(),
profile.timezoneOffset());
final FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
@ -874,20 +880,20 @@ public class Crawler_p {
final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Let's report here detailed error to help user when he selected a wrong file */
if(!crawlingFile.exists()) {
writer.close();
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exist");
}
if(!crawlingFile.isFile()) {
writer.close();
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
}
if(!crawlingFile.canRead()) {
writer.close();
throw new IOException("Cannot read: " + crawlingFile.getAbsolutePath());
}
}
