From eb20589e29747949c2cfdb47a79a7c7d26f887ac Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Sat, 10 Feb 2018 11:56:28 +0100
Subject: [PATCH] Fixed issue #158 : completed div CSS class ignore in crawl

---
 htroot/CrawlStartExpert.html                  |   2 +-
 .../document/parser/html/AbstractScraper.java |  11 --
 .../document/parser/html/ContentScraper.java  |  88 +++++++++++----
 .../yacy/document/parser/html/Scraper.java    |  23 ++--
 .../parser/html/TransformerWriter.java        |  26 ++++-
 .../yacy/document/parser/htmlParserTest.java  | 102 ++++++++++++++++++
 6 files changed, 208 insertions(+), 44 deletions(-)
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index cba76c34d..78af86373 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -373,7 +373,7 @@
 	        <dt>Filter div class names</dt>
 	        <dd>
             <table border="0">
-		    <tr><td width="110">set of class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of div class names which should be filtered out</td></tr>
+		    <tr><td width="110">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; element class names which should be filtered out</td></tr>
 			</table>
 	        </dd>
 	      </dl>
diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java
index e0980c21b..1f4a5fd0b 100644
--- a/source/net/yacy/document/parser/html/AbstractScraper.java
+++ b/source/net/yacy/document/parser/html/AbstractScraper.java
@@ -65,17 +65,6 @@ public abstract class AbstractScraper implements Scraper {
         return (this.tags1 != null) && (this.tags1.contains(tag.toLowerCase()));
     }
 
-    //the 'missing' method that shall be implemented:
-    @Override
-    public abstract void scrapeText(char[] text, String insideTag);
-
-    // the other methods must take into account to construct the return value correctly
-    @Override
-    public abstract void scrapeTag0(ContentScraper.Tag tag);
-
-    @Override
-    public abstract void scrapeTag1(ContentScraper.Tag tag);
-
     public static String stripAllTags(final char[] s) {
         if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return "";
         final StringBuilder r = new StringBuilder(s.length);
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 76981ffc2..1a4d46bab 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -145,6 +145,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         public String name;
         public Properties opts;
         public CharBuffer content;
+        
+        /** Set to true when this tag should be ignored from scraping */
+        private boolean ignore = false;
+        
         public Tag(final String name) {
             this.name = name;
             this.opts = new Properties();
@@ -174,6 +178,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         public String toString() {
             return "<" + name + " " + opts + ">" + content + "</" + name + ">";
         }
+        
+        /** @return true when this tag should be ignored from scraping */
+        public boolean isIgnore() {
+			return this.ignore;
+		}
+        
+        /**
+         * @param ignore true when this tag should be ignored from scraping
+         */
+        public void setIgnore(final boolean ignore) {
+			this.ignore = ignore;
+		}
     }
 
     // all these tags must be given in lowercase, because the tags from the files are compared in lowercase
@@ -216,7 +232,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private final int maxAnchors;
     
     private final VocabularyScraper vocabularyScraper;
-    private final Set<String> ignore_class_name;
+    
+    /** Set of CSS class names whose matching div elements content should be ignored */
+    private final Set<String> ignoreDivClassNames;
+    
     private final int timezoneOffset;
     private int breadcrumbs;
 
@@ -245,18 +264,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
      * @param root the document root url
      * @param maxAnchors the maximum number of URLs to process and store in the anchors property.
      * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
+     * @param ignoreDivClassNames an optional set of CSS class names : the content of div elements matching any of them is ignored. May be null.
      * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
      * @param timezoneOffset local time zone offset
      */
     @SuppressWarnings("unchecked")
-    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
         // the root value here will not be used to load the resource.
         // it is only the reference for relative links
         super(linkTags0, linkTags1);
         assert root != null;
         this.root = root;
         this.vocabularyScraper = vocabularyScraper;
-        this.ignore_class_name = ignore_class_name;
+        this.ignoreDivClassNames = ignoreDivClassNames;
         this.timezoneOffset = timezoneOffset;
         this.evaluationScores = new Evaluation();
         this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@@ -314,9 +334,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
 
     @Override
-    public void scrapeText(final char[] newtext0, final String insideTag) {
-        // System.out.println("SCRAPE: " + UTF8.String(newtext));
-        if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return;
+    public void scrapeText(final char[] newtext0, final Tag insideTag) {
+        if (insideTag != null) {
+        	if(insideTag.ignore) {
+        		return;
+        	}
+			if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
+				return;
+			}
+        }
         int p, pl, q, s = 0;
         char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
         
@@ -377,7 +403,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         }
         // find tags inside text
         String b = cleanLine(stripAllTags(newtext));
-        if ((insideTag != null) && (!(insideTag.equals("a")))) {
+        if ((insideTag != null) && (!(insideTag.name.equals(TagName.a.name())))) {
             // texts inside tags sometimes have no punctuation at the line end
             // this is bad for the text semantics, because it is not possible for the
             // condenser to distinguish headlines from text beginnings.
@@ -697,6 +723,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
      */
     @Override
     public void scrapeTag0(final Tag tag) {
+    	if(tag.ignore) {
+    		return;
+    	}
         checkOpts(tag);
         if (tag.name.equalsIgnoreCase("img")) {
             final String src = tag.opts.getProperty("src", EMPTY_STRING);
@@ -861,6 +890,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
      */
     @Override
     public void scrapeTag1(final Tag tag) {
+    	if(tag.ignore) {
+    		return;
+    	}
         checkOpts(tag);
         // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
         if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
@@ -882,18 +914,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         }
         final String h;
         if (tag.name.equalsIgnoreCase("div")) {
-            final String classn = tag.opts.getProperty("class", EMPTY_STRING);
-            if (classn.length() > 0 && this.ignore_class_name.contains(classn)) {
-            	// we remove everything inside that tag, so it can be ignored
-            	tag.content.clear();
-            } else {
-	            final String id = tag.opts.getProperty("id", EMPTY_STRING);
-	            this.evaluationScores.match(Element.divid, id);
-	            final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
-	            if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
-	                breadcrumbs++;
-	            }
-            }
+	       final String id = tag.opts.getProperty("id", EMPTY_STRING);
+	       this.evaluationScores.match(Element.divid, id);
+	       final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
+	       if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
+	    	   breadcrumbs++;
+	       }
         } else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
             h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.headlines[0].add(h);
@@ -974,14 +1000,32 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 	 * {@link ContentScraper#linkTags0} and {@link ContentScraper#linkTags1}.
 	 */
 	@Override
-	public void scrapeAnyTagOpening(final String tagName, final Properties tagAttributes) {
-		if (tagAttributes != null) {
+	public void scrapeAnyTagOpening(final Tag tag) {
+		if (tag != null && !tag.ignore && tag.opts != null) {
 			/*
 			 * HTML microdata can be annotated on any kind of tag, so we don't restrict this
 			 * scraping to the limited sets in linkTags0 and linkTags1
 			 */
-			this.linkedDataTypes.addAll(parseMicrodataItemType(tagAttributes));
+			this.linkedDataTypes.addAll(parseMicrodataItemType(tag.opts));
+		}
+	}
+	
+	@Override
+	public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
+		boolean ignore = false;
+		
+        /* First, inherit the ignore property from the parent tag, if any */
+		if(parentTag != null) {
+			ignore = parentTag.ignore;
+		}
+		
+		/* Parent is not marked as ignored : let's check the current tag */
+		if (!ignore && this.ignoreDivClassNames != null && tag != null && TagName.div.name().equals(tag.name)) {
+			final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
+			final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
+			ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
 		}
+		return ignore;
 	}
     
     /**
diff --git a/source/net/yacy/document/parser/html/Scraper.java b/source/net/yacy/document/parser/html/Scraper.java
index b483d5a8b..704b3560b 100644
--- a/source/net/yacy/document/parser/html/Scraper.java
+++ b/source/net/yacy/document/parser/html/Scraper.java
@@ -24,8 +24,6 @@
 
 package net.yacy.document.parser.html;
 
-import java.util.Properties;
-
 public interface Scraper {
 
 	/**
@@ -50,7 +48,12 @@ public interface Scraper {
 	 */
     public boolean isTag1(String tag);
 
-    public void scrapeText(char[] text, String insideTag);
+    /**
+     * Process plain text
+     * @param plain text to process
+     * @param insideTag the eventual direct parent tag. May be null.
+     */
+    public void scrapeText(char[] text, ContentScraper.Tag insideTag);
 
     /**
      * Process a tag belonging to the first category of tags according to the Scraper implementation
@@ -66,10 +69,18 @@ public interface Scraper {
     
     /**
      * Processing applied to any kind of tag opening.
-     * @param tagName the tag name
-     * @param tagAttributes the atttributes of the tag
+     * @param tag a parsed tag
      */
-    public void scrapeAnyTagOpening(String tagName, Properties tagAttributes);
+    public void scrapeAnyTagOpening(ContentScraper.Tag tag);
+    
+	/**
+	 * @param tag
+	 *            a parsed tag
+	 * @param parentTag the parent tag, if any. May be null.
+	 * @return true when the tag should be ignored according to the scraper
+	 *         implementation rules
+	 */
+    public boolean shouldIgnoreTag(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
 
     public void scrapeComment(final char[] comment);
 
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index eb246a997..1bf300e5e 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -232,15 +232,19 @@ public final class TransformerWriter extends Writer {
         if (this.tagStack.size() == 0) {
             // we are not collection tag text -> case (1) - (3)
             // case (1): this is not a tag opener/closer
-            if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
-            if (this.transformer != null) return this.transformer.transformText(content);
+            if (this.scraper != null && content.length > 0) {
+            	this.scraper.scrapeText(content, null);
+            }
+            if (this.transformer != null) {
+            	return this.transformer.transformText(content);
+            }
             return content;
         }
 
         // we are collection tag text for the tag 'filterTag' -> case (4) - (7)
         // case (4): getting no tag, go on collecting content
         if (this.scraper != null) {
-            this.scraper.scrapeText(content, this.tagStack.lastElement().name);
+            this.scraper.scrapeText(content, this.tagStack.lastElement());
         }
         if (this.transformer != null) {
             this.tagStack.lastElement().content.append(this.transformer.transformText(content));
@@ -293,8 +297,22 @@ public final class TransformerWriter extends Writer {
         ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
         charBuffer.close();
         
+        final ContentScraper.Tag parentTag;
+        if(this.tagStack.size() > 0) {
+        	parentTag = this.tagStack.lastElement();
+        } else {
+        	parentTag = null;
+        }
+        
+        /* Check scraper ignoring rules */
+		if (this.scraper != null && this.scraper.shouldIgnoreTag(tag, parentTag)) {
+			tag.setIgnore(true);
+		}
+        
         /* Apply processing relevant for any kind of tag opening */
-        this.scraper.scrapeAnyTagOpening(tag.name, tag.opts);
+        if(this.scraper != null) {
+        	this.scraper.scrapeAnyTagOpening(tag);
+        }
         
         if (this.scraper != null && this.scraper.isTag0(tagname)) {
             // this single tag is collected at once here
diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java
index 4366d8c4b..5c4b62c28 100644
--- a/test/java/net/yacy/document/parser/htmlParserTest.java
+++ b/test/java/net/yacy/document/parser/htmlParserTest.java
@@ -13,6 +13,7 @@ import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Set;
 
 import org.junit.Test;
 
@@ -138,6 +139,107 @@ public class htmlParserTest extends TestCase {
 			}
 		}
 	}
+	
+	/**
+	 * Test the htmlParser.parse() method, when filtering out div elements on their CSS class.
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseHtmlDivClassFilter() throws Exception {
+		final AnchorURL url = new AnchorURL("http://localhost/test.html");
+		final String mimetype = "text/html";
+		final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><head><title>Test document</title></head>");
+
+		testHtml.append("<div class=\"top\">Top text");
+		testHtml.append("<a href=\"http://localhost/top.html\">Top link</a>");
+		testHtml.append("</div>");
+
+		testHtml.append("<div class=\"optional\">Some optional content");
+		testHtml.append("<a href=\"http://localhost/content.html\">Link from optional block</a>");
+		testHtml.append("</div>");
+
+		testHtml.append("<p class=\"optional\">A paragraph</p>");
+
+		testHtml.append("<div class=\"optional-text\">Text-only optional block</div>");
+		
+		testHtml.append("<div class=\"optional desc\">");
+		testHtml.append("<div class=\"optional child\">");
+		testHtml.append("<div class=\"child\">");
+		testHtml.append("<p>Child text at depth 3</p>");
+		testHtml.append("</div></div></div>");
+
+		testHtml.append("<div class=\"bottom optional media\" itemscope itemtype=\"https://schema.org/LocalBusiness\"><img itemprop=\"logo\" src=\"http://localhost/image.png\" alt=\"Our Company\"></div>");
+		
+		final htmlParser parser = new htmlParser();
+
+		/* No CSS class filter */
+		try (InputStream sourceStream = new ByteArrayInputStream(
+				testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
+			final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
+			final Document doc = docs[0];
+			final String parsedText = doc.getTextString();
+			
+			/* Check everything has been parsed */
+			assertEquals(2, doc.getAnchors().size());
+			assertEquals(1, doc.getImages().size());
+			assertEquals(1, doc.getLinkedDataTypes().size());
+			assertTrue(parsedText.contains("Top"));
+			assertTrue(parsedText.contains("Some"));
+			assertTrue(parsedText.contains("from"));
+			assertTrue(parsedText.contains("paragraph"));
+			assertTrue(parsedText.contains("Text-only"));
+			assertTrue(parsedText.contains("depth"));
+		}
+		
+		/* Filter on CSS classes with no matching elements ("opt" is only a prefix of "optional" : it must not match) */
+		try (InputStream sourceStream = new ByteArrayInputStream(
+				testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
+			final Set<String> ignore = new HashSet<>();
+			ignore.add("opt");
+			ignore.add("head");
+			ignore.add("container");
+			final Document[] docs = parser.parse(url, mimetype, null, ignore, new VocabularyScraper(), 0, sourceStream);
+			final Document doc = docs[0];
+			final String parsedText = doc.getTextString();
+			
+			/* Check everything has been parsed */
+			assertEquals(2, doc.getAnchors().size());
+			assertEquals(1, doc.getImages().size());
+			assertEquals(1, doc.getLinkedDataTypes().size());
+			assertTrue(parsedText.contains("Top"));
+			assertTrue(parsedText.contains("Some"));
+			assertTrue(parsedText.contains("from"));
+			assertTrue(parsedText.contains("paragraph"));
+			assertTrue(parsedText.contains("Text-only"));
+			assertTrue(parsedText.contains("depth"));
+		}
+		
+		/* Filter on CSS class with matching elements */
+		try (InputStream sourceStream = new ByteArrayInputStream(
+				testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
+			final Set<String> ignore = new HashSet<>();
+			ignore.add("optional");
+			final Document[] docs = parser.parse(url, mimetype, null, ignore, new VocabularyScraper(), 0, sourceStream);
+			final Document doc = docs[0];
+			final String parsedText = doc.getTextString();
+			
+			/* Check matching blocks have been ignored */
+			assertEquals(1, doc.getAnchors().size());
+			assertEquals("http://localhost/top.html", doc.getAnchors().iterator().next().toString());
+			assertEquals(0, doc.getLinkedDataTypes().size());
+			assertEquals(0, doc.getImages().size());
+			assertFalse(parsedText.contains("Some"));
+			assertFalse(parsedText.contains("from"));
+			assertFalse(parsedText.contains("depth"));
+			
+			/* Check non-matching blocks have been normally parsed */
+			assertTrue(parsedText.contains("Top"));
+			assertTrue(parsedText.contains("Text-only"));
+			assertTrue(parsedText.contains("paragraph"));
+		}
+	}
     
     /**
      * Test the htmlParser.parseWithLimits() method with test content within bounds.

set of class names		comma-separated list of div class names which should be filtered out
set of CSS class names		comma-separated list of <div> element class names which should be filtered out