added a crawl filter based on <div> tag class names

When a crawl is started, a new field is available to exclude content from
scraping. The content to exclude is identified by the class names of div
tags: all text contained in a div tag whose class name matches one of the
configured class name(s) is not indexed, while the rest of the page is
indexed.
pull/149/head
Michael Peter Christen 7 years ago
parent 607b39b427
commit 25573bd5ab
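As an illustration of the new behavior, here is a minimal sketch (not part of the commit; the URL, class name, and sample markup are invented) that uses the parseToScraper helper as extended below:

import java.util.HashSet;
import java.util.Set;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;

public class IgnoreClassNameSketch {
    public static void main(String[] args) throws Exception {
        // class names to filter; "sidebar" is an invented example value
        Set<String> ignore = new HashSet<>();
        ignore.add("sidebar");
        String html = "<html><body>"
                + "<div class=\"sidebar\">navigation junk</div>"
                + "<p>real content</p>"
                + "</body></html>";
        // signature of parseToScraper as extended in this commit
        ContentScraper scraper = htmlParser.parseToScraper(
                new DigestURL("http://localhost/"), "UTF-8", ignore,
                new VocabularyScraper(), 0, html, 10, 10);
        // expected: "real content" is kept, "navigation junk" is dropped
        System.out.println(scraper.getText());
    }
}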

@ -366,6 +366,18 @@
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Content Filter</legend>
<p>These are limitations on parts of a document. The filter will be applied after a web page has been loaded.</p>
<dl>
<dt>Filter div class names</dt>
<dd>
<table border="0">
<tr><td width="110">set of class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of div class names which should be filtered out</td></tr>
</table>
</dd>
</dl>
</fieldset>
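For example (invented values), entering "sidebar, footer-nav" in the field above causes all text inside div tags whose class attribute is exactly "sidebar" or exactly "footer-nav" to be excluded from indexing.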
<fieldset>
<legend>Clean-Up before Crawl Start</legend>
<dl>

@ -513,6 +513,14 @@ public class CrawlStartExpert {
}
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Ignore Class Name
if (post != null && post.containsKey("ignoreclassname")) {
prop.put("ignoreclassname",
post.get("ignoreclassname", ""));
} else {
prop.put("ignoreclassname", "");
}
// ---------- Enrich Vocabulary
Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
if (vocs.size() == 0) {

@ -468,6 +468,15 @@ public class Crawler_p {
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
String ignoreclassname_s = post.get("ignoreclassname");
Set<String> ignoreclassname = new HashSet<>();
if (ignoreclassname_s != null) {
String[] ignoreclassname_a = ignoreclassname_s.trim().split(",");
for (int i = 0; i < ignoreclassname_a.length; i++) {
ignoreclassname.add(ignoreclassname_a[i].trim());
}
}
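For illustration (invented input), the split above turns a field value like "sidebar, footer-nav ,ads" into the set {"sidebar", "footer-nav", "ads"}, trimming whitespace per entry. An empty field yields a set containing only the empty string, which is harmless because the scraper only filters non-empty class attributes.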
// get vocabulary scraper info
JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
for (String key: post.keySet()) {
@ -552,6 +561,7 @@ public class Crawler_p {
cachePolicy,
collection,
agentName,
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
handle = ASCII.getBytes(profile.handle());
@ -646,7 +656,7 @@ public class Crawler_p {
/* No restriction on domains or subpath : we now scrape links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
new VocabularyScraper(), profile.timezoneOffset());
new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
@ -784,7 +794,7 @@ public class Crawler_p {
final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Let's report a detailed error here to help the user when a wrong file was selected */

@ -159,7 +159,7 @@ public class QuickCrawlLink_p {
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {

@ -297,7 +297,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
@ -330,7 +330,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlShallowProfile.handle()),
@ -362,7 +362,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
@ -394,7 +394,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
@ -426,7 +426,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
@ -458,7 +458,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
@ -491,7 +491,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
@ -523,7 +523,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
@ -555,7 +555,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
@ -587,7 +587,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
@ -622,7 +622,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);
this.defaultPushProfiles.put(collection, genericPushProfile);

@ -28,10 +28,12 @@ package net.yacy.crawler.data;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
@ -44,6 +46,8 @@ import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.JSONArray;
import net.yacy.cora.util.JSONTokener;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
@ -96,6 +100,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"),
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
@ -128,6 +133,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final Set<String> ignore_class_name;
private final VocabularyScraper scraper;
/**
@ -190,6 +196,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
@ -230,9 +237,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch);
put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the scraper information into a JSON Array
// we transform the ignore_class_name and scraper information into a JSON Array
this.ignore_class_name = ignore_class_name == null ? new HashSet<String>() : ignore_class_name;
String jsonString = new JSONArray(ignore_class_name).toString();
put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
String jsonString = this.scraper.toString();
jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
put(CrawlAttribute.SCRAPER.key, jsonString);
put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
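With invented values, the profile map then carries an entry like ignore_class_name = ["sidebar","ads"]; the second constructor below reads this JSON array back into a HashSet.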
@ -246,10 +256,18 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(CrawlAttribute.SCRAPER.key);
String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
JSONArray a = jsonString == null ? new JSONArray() : new JSONArray(new JSONTokener(jsonString));
this.ignore_class_name = new HashSet<String>();
for (int i = 0; i < a.length(); i++) this.ignore_class_name.add(a.getString(i));
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
this.scraper = jsonString == null || jsonString.length() == 0 ? new VocabularyScraper() : new VocabularyScraper(jsonString);
}
public Set<String> ignoreDivClassName() {
return this.ignore_class_name;
}
public VocabularyScraper scraper() {
return this.scraper;
}
@ -798,4 +816,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
}
public static void main(String[] args) {
// test to convert the key set from set to string and back
Set<String> a = new HashSet<>();
a.add("eins"); a.add("zwei"); a.add("drei");
JSONArray j = new JSONArray(a);
String s = j.toString();
System.out.println(s);
JSONTokener o = new JSONTokener(s);
j = new JSONArray(o);
System.out.println(j);
Set<String> h = new HashSet<String>();
for (int i = 0; i < j.length(); i++) h.add(j.getString(i));
System.out.println(h);
}
}
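(Run standalone, the main() above prints the JSON array, the re-parsed array, and the reconstructed set; since HashSet has no defined iteration order, the element order may vary between runs.)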

@ -28,6 +28,7 @@ package net.yacy.crawler.retrieval;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import net.yacy.cora.document.analysis.Classification;
@ -861,7 +862,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet<String>(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {

@ -135,7 +135,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper(), 0);
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);

@ -190,7 +190,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(

@ -50,6 +50,70 @@ public abstract class AbstractParser implements Parser {
this.name = name;
}
/*
* The following default implementations call each other; if neither is overridden they would recurse endlessly.
* Both are provided so that an implementing class only needs to override one of them.
*/
@Override
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source);
}
@Override
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
/*
* The following default implementations call each other; if neither is overridden they would recurse endlessly.
* Both are provided so that an implementing class only needs to override one of them.
*/
@Override
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
}
@Override
public Document[] parseWithLimits(
DigestURL location,
String mimeType,
String charset,
final Set<String> ignore_class_name,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, maxLinks, maxBytes);
}
/**
* return the name of the parser
*/
@ -101,14 +165,6 @@ public abstract class AbstractParser implements Parser {
return c;
}
@Override
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
/* Please override on subclasses when implementation is possible */
throw new UnsupportedOperationException();
}
@Override
public boolean isParseWithLimitsSupported() {
/* Please override on subclasses when parseWithLimits is supported */
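To make the override requirement concrete, a hypothetical subclass (not part of the commit; name and format invented) could break the circular defaults by overriding only the Set<String>-aware parse variant:

import java.io.InputStream;
import java.util.Set;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;

// sketch: the legacy parse(...) without the ignore_class_name set is
// routed to this override by AbstractParser's default implementation
public class MyFormatParser extends AbstractParser {

    public MyFormatParser() {
        super("my format parser"); // invented parser name
    }

    @Override
    public Document[] parse(DigestURL url, String mimeType, String charset,
            Set<String> ignoreClassName, VocabularyScraper scraper,
            int timezoneOffset, InputStream source)
            throws Parser.Failure, InterruptedException {
        // ... read the source stream and build documents here ...
        return new Document[0]; // placeholder
    }
}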

@ -28,6 +28,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.Parser.Failure;
public interface Parser {
@ -64,6 +65,16 @@ public interface Parser {
InputStream source
) throws Parser.Failure, InterruptedException;
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
/**
* Parse an input stream, terminating processing once a total of
* maxLinks URLs (anchors, image links, media links...) has been reached,
@ -103,10 +114,17 @@ public interface Parser {
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset,
VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**
* @return true when the parser implementation supports the
* parseWithLimits() operation.

@ -182,6 +182,7 @@ public final class TextParser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -198,7 +199,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -215,6 +216,7 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -232,7 +234,7 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
@ -241,6 +243,7 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -302,7 +305,7 @@ public final class TextParser {
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try {
return parseSource(location, mimeType, parser, charset, scraper, timezoneOffset,
return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
nonCloseInputStream, maxLinks, maxBytes);
} catch (Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
@ -364,15 +367,16 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, contentLength, sourceStream,
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE);
}
@ -397,7 +401,7 @@ public final class TextParser {
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@ -420,6 +424,7 @@ public final class TextParser {
final String mimeType,
final Parser parser,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream,
@ -435,11 +440,11 @@ public final class TextParser {
try {
final Document[] docs;
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser does not support partial parsing within limits : let's control it here */
InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch(Parser.Failure e) {
@ -468,6 +473,7 @@ public final class TextParser {
final String mimeType,
final Set<Parser> parsers,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -495,13 +501,13 @@ public final class TextParser {
}
try {
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, bis, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else {
/* Partial parsing is not supported by this parser : check content length now */
if(sourceArray.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&

@ -40,6 +40,7 @@ import java.io.UnsupportedEncodingException;
import java.lang.reflect.Array;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
@ -534,7 +535,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure {
try {
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);

@ -33,6 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
@ -69,6 +70,7 @@ public class bzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -123,7 +125,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
final Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -33,6 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
@ -71,6 +72,7 @@ public class gzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
@ -126,7 +128,7 @@ public class gzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -209,6 +209,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final int maxAnchors;
private final VocabularyScraper vocabularyScraper;
private final Set<String> ignore_class_name;
private final int timezoneOffset;
private int breadcrumbs;
@ -241,13 +242,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param timezoneOffset local time zone offset
*/
@SuppressWarnings("unchecked")
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.vocabularyScraper = vocabularyScraper;
this.ignore_class_name = ignore_class_name;
this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -294,8 +296,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
public ContentScraper(final DigestURL root, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
this(root, Integer.MAX_VALUE, maxLinks, vocabularyScraper, timezoneOffset);
public ContentScraper(final DigestURL root, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
this(root, Integer.MAX_VALUE, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
}
@Override
@ -835,11 +837,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
final String h;
if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
final String classn = tag.opts.getProperty("class", EMPTY_STRING);
if (classn.length() > 0 && this.ignore_class_name.contains(classn)) {
// we remove everything inside that tag, so it can be ignored
tag.content.clear();
} else {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
}
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
@ -1477,13 +1485,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper(), timezoneOffset);
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();
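A note on the matching rule in the div handling added above: classn is the full value of the class attribute and the test is a literal Set.contains, so multi-class attributes must be configured verbatim. A quick sketch (invented values):

Set<String> ignore = new HashSet<>();
ignore.add("sidebar");
ignore.contains("sidebar");      // true  -> <div class="sidebar"> is filtered
ignore.contains("sidebar wide"); // false -> <div class="sidebar wide"> is kept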

@ -35,6 +35,7 @@ import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
@ -61,6 +62,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream(
final InputStream inStream,
final String inputStreamCharset,
final Set<String> ignore_class_name,
final VocabularyScraper vocabularyScraper,
final DigestURL rooturl,
final Transformer transformer,
@ -72,7 +74,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper, timezoneOffset);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
scraper.registerHtmlFilterEventListener(this);
try {

@ -36,8 +36,10 @@ import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.io.IOUtils;
@ -105,7 +107,20 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(location, mimeType, documentCharset, new HashSet<String>(), vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
}
@Override
public Document[] parse(
final DigestURL location,
final String mimeType,
final String documentCharset,
final Set<String> ignore_class_name,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
}
@Override
@ -114,19 +129,20 @@ public class htmlParser extends AbstractParser implements Parser {
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Failure {
return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
}
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraper = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -135,10 +151,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes the anchor part from the original url, getRef() is never successful - consider other handling such as removeRef() in crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@ -203,7 +219,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -213,7 +229,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static method no need to init local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
scraper = parseToScraper(location, documentCharset, ignore_class_name, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -238,6 +254,7 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final Set<String> ignore_class_name,
final VocabularyScraper vocabularyScraper,
final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
@ -258,7 +275,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks, timezoneOffset);
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, null, false, maxLinks, timezoneOffset);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
} catch (final IOException e1) {
@ -293,7 +310,7 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content
// for this static method no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, vocabularyScraper, timezoneOffset);
final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
@ -420,8 +437,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @param maxBytes the maximum number of content bytes to process
* @return the document resulting from the parsed snapshot, or null if no snapshot exists or on any other issue with the snapshot
*/
private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
final VocabularyScraper vocscraper, final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
private Document parseAlternativeSnapshot(
final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
// construct url for case (1) with anchor
@ -440,7 +459,7 @@ public class htmlParser extends AbstractParser implements Parser {
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {

@ -33,6 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Date;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
@ -62,6 +63,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final int timezoneOffset,
final IInStream source) throws Parser.Failure, InterruptedException {
@ -92,7 +94,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
}
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), timezoneOffset);
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), ignore_class_name, timezoneOffset);
AbstractParser.log.fine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
@ -114,9 +116,10 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final int timezoneOffset,
final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, timezoneOffset, new ByteArrayIInStream(source));
return parse(location, mimeType, charset, ignore_class_name, timezoneOffset, new ByteArrayIInStream(source));
}
@Override
@ -124,13 +127,14 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
return new Document[]{parse(location, mimeType, charset, timezoneOffset, cfos.toByteArray())};
return new Document[]{parse(location, mimeType, charset, ignore_class_name, timezoneOffset, cfos.toByteArray())};
} catch (final IOException e) {
throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
}
@ -144,6 +148,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
private Set<String> ignore_class_name;
private final int timezoneOffset;
public SZParserExtractCallback(
@ -151,11 +156,13 @@ public class sevenzipParser extends AbstractParser implements Parser {
final IInArchive handler,
final Document doc,
final String prefix,
final Set<String> ignore_class_name,
final int timezoneOffset) {
super.Init(handler);
this.log = logger;
this.doc = doc;
this.prefix = prefix;
this.ignore_class_name = ignore_class_name;
this.timezoneOffset = timezoneOffset;
}
@ -198,7 +205,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, this.ignore_class_name, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}

@ -31,6 +31,7 @@ import java.io.InputStream;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
@ -69,6 +70,7 @@ public class tarParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException {
@ -110,7 +112,7 @@ public class tarParser extends AbstractParser implements Parser {
* as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, scraper, timezoneOffset, 999, tmp);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) {
continue;
}

@ -28,6 +28,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@ -69,6 +70,7 @@ public class zipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -117,7 +119,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, scraper, timezoneOffset, 999, tmp);
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue;
maindoc.addSubDocuments(docs);
} catch (final Parser.Failure e) {

@ -688,7 +688,12 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<AnchorURL, String> loadLinks(final DigestURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent, final int timezoneOffset) throws IOException {
public final Map<AnchorURL, String> loadLinks(
final DigestURL url,
final CacheStrategy cacheStrategy,
BlacklistType blacklistType,
final ClientIdentification.Agent agent,
final int timezoneOffset) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();
@ -699,7 +704,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.getContentType());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().ignoreDivClassName(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());

@ -2951,6 +2951,7 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),

@ -30,6 +30,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
@ -162,7 +163,7 @@ public class DocumentIndex extends Segment {
InputStream sourceStream = null;
try {
sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
documents = TextParser.parseSource(url, null, null, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
} finally {

@ -29,6 +29,7 @@ import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -145,7 +146,7 @@ public class ContentScraperTest {
+ "<time datetime='2016-12-23'>23. Dezember 2016</time>" // html5 time tag
+ "</body></html>";
ContentScraper scraper = new ContentScraper(root, 10, new VocabularyScraper(), 0);
ContentScraper scraper = new ContentScraper(root, 10, new HashSet<String>(), new VocabularyScraper(), 0);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new StringReader(page), writer);

@ -10,6 +10,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
@ -265,7 +266,7 @@ public class htmlParserTest extends TestCase {
+ "<figure><img width=\"550px\" title=\"image as exemple\" alt=\"image as exemple\" src=\"./img/my_image.png\"></figrue>" // + img width 550 (+html5 figure)
+ "</body></html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
ContentScraper scraper = parseToScraper(url, charset, new HashSet<String>(), new VocabularyScraper(), 0, testhtml, 10, 10);
List<AnchorURL> anchorlist = scraper.getAnchors();
String linktxt = anchorlist.get(0).getTextProperty();
@ -307,7 +308,7 @@ public class htmlParserTest extends TestCase {
}
testHtml.append("</p></body></html>");
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), Integer.MAX_VALUE, Integer.MAX_VALUE);
ContentScraper scraper = parseToScraper(url, charset, new HashSet<String>(), new VocabularyScraper(), 0, testHtml.toString(), Integer.MAX_VALUE, Integer.MAX_VALUE);
assertEquals(nestingDepth, scraper.getAnchors().size());
assertEquals(1, scraper.getImages().size());
@ -328,7 +329,7 @@ public class htmlParserTest extends TestCase {
+ "<p>" + textSource + "</p>"
+ "</body></html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
ContentScraper scraper = parseToScraper(url, charset, new HashSet<String>(), new VocabularyScraper(), 0, testhtml, 10, 10);
String txt = scraper.getText();
System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
@ -357,7 +358,7 @@ public class htmlParserTest extends TestCase {
+ "</head>\n"
+ "<body>" + textSource + "</body>\n"
+ "</html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
ContentScraper scraper = parseToScraper(url, charset, new HashSet<String>(), new VocabularyScraper(), 0, testhtml, 10, 10);
String txt = scraper.getText();
System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]");
