From eb20589e29747949c2cfdb47a79a7c7d26f887ac Mon Sep 17 00:00:00 2001 From: luccioman Date: Sat, 10 Feb 2018 11:56:28 +0100 Subject: [PATCH] Fixed issue #158 : completed div CSS class ignore in crawl --- htroot/CrawlStartExpert.html | 2 +- .../document/parser/html/AbstractScraper.java | 11 -- .../document/parser/html/ContentScraper.java | 88 +++++++++++---- .../yacy/document/parser/html/Scraper.java | 23 ++-- .../parser/html/TransformerWriter.java | 26 ++++- .../yacy/document/parser/htmlParserTest.java | 102 ++++++++++++++++++ 6 files changed, 208 insertions(+), 44 deletions(-) diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html index cba76c34d..78af86373 100644 --- a/htroot/CrawlStartExpert.html +++ b/htroot/CrawlStartExpert.html @@ -373,7 +373,7 @@
Filter div class names
- +
set of class names: comma-separated list of div class names which should be filtered out
set of CSS class names: comma-separated list of <div> element class names which should be filtered out
diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java index e0980c21b..1f4a5fd0b 100644 --- a/source/net/yacy/document/parser/html/AbstractScraper.java +++ b/source/net/yacy/document/parser/html/AbstractScraper.java @@ -65,17 +65,6 @@ public abstract class AbstractScraper implements Scraper { return (this.tags1 != null) && (this.tags1.contains(tag.toLowerCase())); } - //the 'missing' method that shall be implemented: - @Override - public abstract void scrapeText(char[] text, String insideTag); - - // the other methods must take into account to construct the return value correctly - @Override - public abstract void scrapeTag0(ContentScraper.Tag tag); - - @Override - public abstract void scrapeTag1(ContentScraper.Tag tag); - public static String stripAllTags(final char[] s) { if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return ""; final StringBuilder r = new StringBuilder(s.length); diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 76981ffc2..1a4d46bab 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -145,6 +145,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { public String name; public Properties opts; public CharBuffer content; + + /** Set to true when this tag should be ignored from scraping */ + private boolean ignore = false; + public Tag(final String name) { this.name = name; this.opts = new Properties(); @@ -174,6 +178,18 @@ public class ContentScraper extends AbstractScraper implements Scraper { public String toString() { return "<" + name + " " + opts + ">" + content + ""; } + + /** @return true when this tag should be ignored from scraping */ + public boolean isIgnore() { + return this.ignore; + } + + /** + * @param ignore true when this tag should be ignored 
from scraping + */ + public void setIgnore(final boolean ignore) { + this.ignore = ignore; + } } // all these tags must be given in lowercase, because the tags from the files are compared in lowercase @@ -216,7 +232,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final int maxAnchors; private final VocabularyScraper vocabularyScraper; - private final Set ignore_class_name; + + /** Set of CSS class names whose matching div elements content should be ignored */ + private final Set ignoreDivClassNames; + private final int timezoneOffset; private int breadcrumbs; @@ -245,18 +264,19 @@ public class ContentScraper extends AbstractScraper implements Scraper { * @param root the document root url * @param maxAnchors the maximum number of URLs to process and store in the anchors property. * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store + * @param ignoreDivClassNames an eventual set of CSS class names whose matching div elements content should be ignored * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms * @param timezoneOffset local time zone offset */ @SuppressWarnings("unchecked") - public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) { + public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) { // the root value here will not be used to load the resource. 
// it is only the reference for relative links super(linkTags0, linkTags1); assert root != null; this.root = root; this.vocabularyScraper = vocabularyScraper; - this.ignore_class_name = ignore_class_name; + this.ignoreDivClassNames = ignoreDivClassNames; this.timezoneOffset = timezoneOffset; this.evaluationScores = new Evaluation(); this.rss = new SizeLimitedMap(maxLinks); @@ -314,9 +334,15 @@ public class ContentScraper extends AbstractScraper implements Scraper { } @Override - public void scrapeText(final char[] newtext0, final String insideTag) { - // System.out.println("SCRAPE: " + UTF8.String(newtext)); - if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return; + public void scrapeText(final char[] newtext0, final Tag insideTag) { + if (insideTag != null) { + if(insideTag.ignore) { + return; + } + if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) { + return; + } + } int p, pl, q, s = 0; char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray(); @@ -377,7 +403,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // find tags inside text String b = cleanLine(stripAllTags(newtext)); - if ((insideTag != null) && (!(insideTag.equals("a")))) { + if ((insideTag != null) && (!(insideTag.name.equals(TagName.a.name())))) { // texts inside tags sometimes have no punctuation at the line end // this is bad for the text semantics, because it is not possible for the // condenser to distinguish headlines from text beginnings. 
@@ -697,6 +723,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { */ @Override public void scrapeTag0(final Tag tag) { + if(tag.ignore) { + return; + } checkOpts(tag); if (tag.name.equalsIgnoreCase("img")) { final String src = tag.opts.getProperty("src", EMPTY_STRING); @@ -861,6 +890,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { */ @Override public void scrapeTag1(final Tag tag) { + if(tag.ignore) { + return; + } checkOpts(tag); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { @@ -882,18 +914,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { } final String h; if (tag.name.equalsIgnoreCase("div")) { - final String classn = tag.opts.getProperty("class", EMPTY_STRING); - if (classn.length() > 0 && this.ignore_class_name.contains(classn)) { - // we remove everything inside that tag, so it can be ignored - tag.content.clear(); - } else { - final String id = tag.opts.getProperty("id", EMPTY_STRING); - this.evaluationScores.match(Element.divid, id); - final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING); - if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) { - breadcrumbs++; - } - } + final String id = tag.opts.getProperty("id", EMPTY_STRING); + this.evaluationScores.match(Element.divid, id); + final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING); + if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) { + breadcrumbs++; + } } else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) { h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.headlines[0].add(h); @@ -974,14 +1000,32 @@ public class ContentScraper extends AbstractScraper implements Scraper { * {@link ContentScraper#linkTags0} and {@link 
ContentScraper#linkTags1}. */ @Override - public void scrapeAnyTagOpening(final String tagName, final Properties tagAttributes) { - if (tagAttributes != null) { + public void scrapeAnyTagOpening(final Tag tag) { + if (tag != null && !tag.ignore && tag.opts != null) { /* * HTML microdata can be annotated on any kind of tag, so we don't restrict this * scraping to the limited sets in linkTags0 and linkTags1 */ - this.linkedDataTypes.addAll(parseMicrodataItemType(tagAttributes)); + this.linkedDataTypes.addAll(parseMicrodataItemType(tag.opts)); + } + } + + @Override + public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) { + boolean ignore = false; + + /* First, inherit ignore property from eventual parent */ + if(parentTag != null) { + ignore = parentTag.ignore; + } + + /* Parent is not marked as ignored : let's check the current tag */ + if (!ignore && this.ignoreDivClassNames != null && tag != null && TagName.div.name().equals(tag.name)) { + final String classAttr = tag.opts.getProperty("class", EMPTY_STRING); + final Set classes = ContentScraper.parseSpaceSeparatedTokens(classAttr); + ignore = !Collections.disjoint(this.ignoreDivClassNames, classes); } + return ignore; } /** diff --git a/source/net/yacy/document/parser/html/Scraper.java b/source/net/yacy/document/parser/html/Scraper.java index b483d5a8b..704b3560b 100644 --- a/source/net/yacy/document/parser/html/Scraper.java +++ b/source/net/yacy/document/parser/html/Scraper.java @@ -24,8 +24,6 @@ package net.yacy.document.parser.html; -import java.util.Properties; - public interface Scraper { /** @@ -50,7 +48,12 @@ public interface Scraper { */ public boolean isTag1(String tag); - public void scrapeText(char[] text, String insideTag); + /** + * Process plain text + * @param text plain text to process + * @param insideTag the direct parent tag, if any. May be null.
+ */ + public void scrapeText(char[] text, ContentScraper.Tag insideTag); /** * Process a tag belonging to the first category of tags according to the Scraper implementation @@ -66,10 +69,18 @@ public interface Scraper { /** * Processing applied to any kind of tag opening. - * @param tagName the tag name - * @param tagAttributes the atttributes of the tag + * @param tag a parsed tag */ - public void scrapeAnyTagOpening(String tagName, Properties tagAttributes); + public void scrapeAnyTagOpening(ContentScraper.Tag tag); + + /** + * @param tag + * a parsed tag + * @param parentTag the eventual parent tag + * @return true when the tag should be ignored according to the scraper + * implementation rules + */ + public boolean shouldIgnoreTag(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag); public void scrapeComment(final char[] comment); diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index eb246a997..1bf300e5e 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -232,15 +232,19 @@ public final class TransformerWriter extends Writer { if (this.tagStack.size() == 0) { // we are not collection tag text -> case (1) - (3) // case (1): this is not a tag opener/closer - if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null); - if (this.transformer != null) return this.transformer.transformText(content); + if (this.scraper != null && content.length > 0) { + this.scraper.scrapeText(content, null); + } + if (this.transformer != null) { + return this.transformer.transformText(content); + } return content; } // we are collection tag text for the tag 'filterTag' -> case (4) - (7) // case (4): getting no tag, go on collecting content if (this.scraper != null) { - this.scraper.scrapeText(content, this.tagStack.lastElement().name); + 
this.scraper.scrapeText(content, this.tagStack.lastElement()); } if (this.transformer != null) { this.tagStack.lastElement().content.append(this.transformer.transformText(content)); @@ -293,8 +297,22 @@ public final class TransformerWriter extends Writer { ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser()); charBuffer.close(); + final ContentScraper.Tag parentTag; + if(this.tagStack.size() > 0) { + parentTag = this.tagStack.lastElement(); + } else { + parentTag = null; + } + + /* Check scraper ignoring rules */ + if (this.scraper != null && this.scraper.shouldIgnoreTag(tag, parentTag)) { + tag.setIgnore(true); + } + /* Apply processing relevant for any kind of tag opening */ - this.scraper.scrapeAnyTagOpening(tag.name, tag.opts); + if(this.scraper != null) { + this.scraper.scrapeAnyTagOpening(tag); + } if (this.scraper != null && this.scraper.isTag0(tagname)) { // this single tag is collected at once here diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java index 4366d8c4b..5c4b62c28 100644 --- a/test/java/net/yacy/document/parser/htmlParserTest.java +++ b/test/java/net/yacy/document/parser/htmlParserTest.java @@ -13,6 +13,7 @@ import java.nio.charset.StandardCharsets; import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; import org.junit.Test; @@ -138,6 +139,107 @@ public class htmlParserTest extends TestCase { } } } + + /** + * Test the htmlParser.parse() method, when filtering out div elements on their CSS class. + * + * @throws Exception + * when an unexpected error occurred + */ + @Test + public void testParseHtmlDivClassFilter() throws Exception { + final AnchorURL url = new AnchorURL("http://localhost/test.html"); + final String mimetype = "text/html"; + final StringBuilder testHtml = new StringBuilder("Test document"); + + testHtml.append("
Top text"); + testHtml.append("Top link"); + testHtml.append("
"); + + testHtml.append("
Some optional content"); + testHtml.append("Link from optional block"); + testHtml.append("
"); + + testHtml.append("

A paragraph

"); + + testHtml.append("
Text-only optional block
"); + + testHtml.append("
"); + testHtml.append("
"); + testHtml.append("
"); + testHtml.append("

Child text at depth 3

"); + testHtml.append("
"); + + testHtml.append("
\"Our
"); + + final htmlParser parser = new htmlParser(); + + /* No CSS class filter */ + try (InputStream sourceStream = new ByteArrayInputStream( + testHtml.toString().getBytes(StandardCharsets.UTF_8));) { + final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream); + final Document doc = docs[0]; + final String parsedDext = doc.getTextString(); + + /* Check everything has been parsed */ + assertEquals(2, doc.getAnchors().size()); + assertEquals(1, doc.getImages().size()); + assertEquals(1, doc.getLinkedDataTypes().size()); + assertTrue(parsedDext.contains("Top")); + assertTrue(parsedDext.contains("Some")); + assertTrue(parsedDext.contains("from")); + assertTrue(parsedDext.contains("paragraph")); + assertTrue(parsedDext.contains("Text-only")); + assertTrue(parsedDext.contains("depth")); + } + + /* Filter on CSS classes with no matching elements */ + try (InputStream sourceStream = new ByteArrayInputStream( + testHtml.toString().getBytes(StandardCharsets.UTF_8));) { + final Set ignore = new HashSet<>(); + ignore.add("opt"); + ignore.add("head"); + ignore.add("container"); + final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream); + final Document doc = docs[0]; + final String parsedDext = doc.getTextString(); + + /* Check everything has been parsed */ + assertEquals(2, doc.getAnchors().size()); + assertEquals(1, doc.getImages().size()); + assertEquals(1, doc.getLinkedDataTypes().size()); + assertTrue(parsedDext.contains("Top")); + assertTrue(parsedDext.contains("Some")); + assertTrue(parsedDext.contains("from")); + assertTrue(parsedDext.contains("paragraph")); + assertTrue(parsedDext.contains("Text-only")); + assertTrue(parsedDext.contains("depth")); + } + + /* Filter on CSS class with matching elements */ + try (InputStream sourceStream = new ByteArrayInputStream( + testHtml.toString().getBytes(StandardCharsets.UTF_8));) { + final Set ignore = new HashSet<>(); + 
ignore.add("optional"); + final Document[] docs = parser.parse(url, mimetype, null, ignore, new VocabularyScraper(), 0, sourceStream); + final Document doc = docs[0]; + final String parsedDext = doc.getTextString(); + + /* Check matching blocks have been ignored */ + assertEquals(1, doc.getAnchors().size()); + assertEquals("http://localhost/top.html", doc.getAnchors().iterator().next().toString()); + assertEquals(0, doc.getLinkedDataTypes().size()); + assertEquals(0, doc.getImages().size()); + assertFalse(parsedDext.contains("Some")); + assertFalse(parsedDext.contains("from")); + assertFalse(parsedDext.contains("depth")); + + /* Check non-matching blocks have been normally parsed */ + assertTrue(parsedDext.contains("Top")); + assertTrue(parsedDext.contains("Text-only")); + assertTrue(parsedDext.contains("paragraph")); + } + } /** * Test the htmlParser.parseWithLimits() method with test content within bounds.