introduction of tag-to-indexing relation TagValency

2 years ago · 5acd98f4da
parent 95e02e5291
commit 5acd98f4da
10 changed files with 651 additions and 570 deletions
--- a/source/net/yacy/data/BookmarkHelper.java
+++ b/source/net/yacy/data/BookmarkHelper.java
@ -60,6 +60,7 @@ import net.yacy.data.BookmarksDB.Bookmark;
 import net.yacy.data.BookmarksDB.Tag;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.TagValency;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.util.FileUtils;
@ -135,7 +136,7 @@ public class BookmarkHelper {
        final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
        try {
            //load the links
-            final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0);
+            final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), 0);
            //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
            final Writer writer = new TransformerWriter(null, null, scraper, false);
            FileUtils.copy(input,writer);
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -146,22 +146,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        public String name;
        public Properties opts;
        public CharBuffer content;
-        
+        private TagValency tv;
-        /** Set to true when this tag should be ignored from scraping */
+        public Tag(final String name, TagValency defaultValency) {
        private boolean ignore = false;
        public Tag(final String name) {
            this.name = name;
            this.tv = defaultValency;
            this.opts = new Properties();
            this.content = new CharBuffer(MAX_TAGSIZE);
        }
-        public Tag(final String name, final Properties opts) {
+        public Tag(final String name, TagValency defaultValency, final Properties opts) {
            this.name = name;
            this.tv = defaultValency;
            this.opts = opts;
            this.content = new CharBuffer(MAX_TAGSIZE);
        }
-        public Tag(final String name, final Properties opts, final CharBuffer content) {
+        public Tag(final String name, TagValency defaultValency, final Properties opts, final CharBuffer content) {
            this.name = name;
            this.tv = defaultValency;
            this.opts = opts;
            this.content = content;
        }
@ -178,14 +178,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        /** @return true when this tag should be ignored from scraping */
        public boolean isIgnore() {
-			return this.ignore;
+            return this.tv == TagValency.IGNORE;
        }
-        
+        public TagValency getValency() {
-        /**
+            return this.tv;
-         * @param ignore true when this tag should be ignored from scraping
+        }
-         */
+        public void setValency(final TagValency tv) {
-        public void setIgnore(final boolean ignore) {
+            this.tv = tv;
 			this.ignore = ignore;
        }
    }
@ -230,8 +229,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    private final VocabularyScraper vocabularyScraper;
-    /** Set of CSS class names whose matching div elements content should be ignored */
+    /** Set of CSS class names whose matching div elements may switch from IGNORE to EVAL or vice versa */
-    private final Set<String> ignoreDivClassNames;
+    private final Set<String> valencySwitchTagNames;
    private final TagValency defaultValency;
    private final int timezoneOffset;
    private int breadcrumbs;
@ -261,19 +261,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     * @param root the document root url
     * @param maxAnchors the maximum number of URLs to process and store in the anchors property.
     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
-     * @param ignoreDivClassNames an eventual set of CSS class names whose matching div elements content should be ignored
+     * @param valencySwitchTagNames an eventual set of CSS class names whose matching div or nav elements switch from the default valency to its reverse
     * @param defaultValency the valency default; should be TagValency.EVAL by default
     * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
     * @param timezoneOffset local time zone offset
     */
    @SuppressWarnings("unchecked")
-    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+    public ContentScraper(
            final DigestURL root,
            final int maxAnchors,
            final int maxLinks,
            final Set<String> valencySwitchTagNames,
            final TagValency defaultValency,
            final VocabularyScraper vocabularyScraper,
            int timezoneOffset) {
        // the root value here will not be used to load the resource.
        // it is only the reference for relative links
        super(linkTags0, linkTags1);
        assert root != null;
        this.root = root;
        this.vocabularyScraper = vocabularyScraper;
-        this.ignoreDivClassNames = ignoreDivClassNames;
+        this.valencySwitchTagNames = valencySwitchTagNames;
        this.defaultValency = defaultValency;
        this.timezoneOffset = timezoneOffset;
        this.evaluationScores = new Evaluation();
        this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -321,8 +330,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
     * @param timezoneOffset local time zone offset
     */
-    public ContentScraper(final DigestURL root, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+    public ContentScraper(
-        this(root, Integer.MAX_VALUE, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
+            final DigestURL root,
            final int maxLinks,
            final Set<String> valencySwitchTagNames,
            final TagValency defaultValency,
            final VocabularyScraper vocabularyScraper,
            int timezoneOffset) {
        this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
    }
    /**
     * @return the default valency this scraper was constructed with
     */
    public TagValency defaultValency() {
        return this.defaultValency;
    }
    @Override
@ -333,7 +352,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    @Override
    public void scrapeText(final char[] newtext0, final Tag insideTag) {
        if (insideTag != null) {
-        	if(insideTag.ignore) {
+            if (insideTag.tv == TagValency.IGNORE) {
                return;
            }
            if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
@ -720,7 +739,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     */
    @Override
    public void scrapeTag0(final Tag tag) {
-    	if(tag.ignore) {
+        if (tag.tv == TagValency.IGNORE) {
            return;
        }
        checkOpts(tag);
@ -893,7 +912,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     */
    @Override
    public void scrapeTag1(final Tag tag) {
-    	if(tag.ignore) {
+        if (tag.tv == TagValency.IGNORE) {
            return;
        }
        checkOpts(tag);
@ -1003,7 +1022,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     */
    @Override
    public void scrapeAnyTagOpening(final Tag tag) {
-		if (tag != null && !tag.ignore && tag.opts != null) {
+        if (tag != null && tag.tv == TagValency.EVAL && tag.opts != null) {
            /*
             * HTML microdata can be annotated on any kind of tag, so we don't restrict this
             * scraping to the limited sets in linkTags0 and linkTags1
@ -1013,24 +1032,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    }
    @Override
-	public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
+    public TagValency tagValency(final Tag tag, final Tag parentTag) {
-		boolean ignore = false;
+        if (parentTag != null && parentTag.tv != this.defaultValency) return parentTag.tv;
        /* First, inherit ignore property from eventual parent */
 		if(parentTag != null) {
 			ignore = parentTag.ignore;
 		}
-		/* Parent is not marked as ignored : let's check the current tag */
+        if (this.valencySwitchTagNames != null &&
 		if (!ignore &&
 		        this.ignoreDivClassNames != null &&
            tag != null &&
            (TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
            final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
            final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
-			ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
+            if (!Collections.disjoint(this.valencySwitchTagNames, classes)) return this.defaultValency.reverse();
        }
-		return ignore;
+        return this.defaultValency;
    }
    /**
@ -1604,13 +1616,25 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        if (page == null) throw new IOException("no content in file " + file.toString());
        // scrape document to look up charset
-        final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset);
+        final ScraperInputStream htmlFilter = new ScraperInputStream(
                new ByteArrayInputStream(page), 
                StandardCharsets.UTF_8.name(), 
                new HashSet<String>(), TagValency.EVAL,
                new VocabularyScraper(), 
                new DigestURL("http://localhost"), 
                false, maxLinks, timezoneOffset);
        String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
        htmlFilter.close();
        if (charset == null) charset = Charset.defaultCharset().toString();
        // scrape content
-        final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
+        final ContentScraper scraper = new ContentScraper(
                new DigestURL("http://localhost"), 
                maxLinks, 
                new HashSet<String>(), 
                TagValency.EVAL, 
                new VocabularyScraper(), 
                timezoneOffset);
        final Writer writer = new TransformerWriter(null, null, scraper, false);
        FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
        writer.close();
--- a/source/net/yacy/document/parser/html/Scraper.java
+++ b/source/net/yacy/document/parser/html/Scraper.java
@ -80,7 +80,9 @@ public interface Scraper {
     * @return true when the tag should be ignored according to the scraper
     *         implementation rules
     */
-    public boolean shouldIgnoreTag(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
+    public TagValency tagValency(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
    public TagValency defaultValency();
    public void scrapeComment(final char[] comment);
--- a/source/net/yacy/document/parser/html/ScraperInputStream.java
+++ b/source/net/yacy/document/parser/html/ScraperInputStream.java
@ -62,7 +62,8 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
    public ScraperInputStream(
            final InputStream inStream,
            final String inputStreamCharset,
-            final Set<String> ignore_class_name,
+            final Set<String> valencySwitchTagNames,
            final TagValency defaultValency,
            final VocabularyScraper vocabularyScraper,
            final DigestURL rooturl,
            final boolean passbyIfBinarySuspect,
@ -73,7 +74,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
        this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
        this.bufferedIn.mark((int) preBufferSize);
-        final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
+        final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
        scraper.registerHtmlFilterEventListener(this);
        try {
--- a/source/net/yacy/document/parser/html/TagValency.java
+++ b/source/net/yacy/document/parser/html/TagValency.java
@ -0,0 +1,30 @@
 /**
 *  TagValency
 *  Copyright 2023 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
 package net.yacy.document.parser.html;
 public enum TagValency {

    /** The tag is not indexed. */
    IGNORE,

    /** The tag is indexed. */
    EVAL;

    /**
     * Flips this valency to the opposite one.
     *
     * @return {@link #EVAL} when invoked on {@link #IGNORE}, otherwise {@link #IGNORE}
     */
    public TagValency reverse() {
        if (this == IGNORE) {
            return EVAL;
        }
        return IGNORE;
    }
 }
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@ -283,7 +283,7 @@ public final class TransformerWriter extends Writer {
    private char[] filterTagOpening(final String tagname, final char[] content) {
        final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
-        ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
+        ContentScraper.Tag tag = new ContentScraper.Tag(tagname, this.scraper.defaultValency(), charBuffer.propParser());
        charBuffer.close();
        final ContentScraper.Tag parentTag;
@ -294,8 +294,8 @@ public final class TransformerWriter extends Writer {
        }
        /* Check scraper ignoring rules */
-		if (this.scraper != null && this.scraper.shouldIgnoreTag(tag, parentTag)) {
+        if (this.scraper != null) {
-			tag.setIgnore(true);
+            tag.setValency(this.scraper.tagValency(tag, parentTag));
        }
        /* Apply processing relevant for any kind of tag opening */
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@ -57,6 +57,7 @@ import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.html.ScraperInputStream;
 import net.yacy.document.parser.html.TagValency;
 import net.yacy.document.parser.html.TransformerWriter;
@ -276,7 +277,16 @@ public class htmlParser extends AbstractParser implements Parser {
        if (charset == null) {
            ScraperInputStream htmlFilter = null;
            try {
-                htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, false, maxLinks, timezoneOffset);
+                htmlFilter = new ScraperInputStream(
                        sourceStream,
                        documentCharset,
                        ignore_class_name,
                        TagValency.EVAL,
                        vocabularyScraper,
                        location,
                        false,
                        maxLinks,
                        timezoneOffset);
                sourceStream = htmlFilter;
                charset = htmlFilter.detectCharset();
            } catch (final IOException e1) {
@ -311,7 +321,14 @@ public class htmlParser extends AbstractParser implements Parser {
        // parsing the content
        // for this static method no need to init local this.scraperObject here
-        final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
+        final ContentScraper scraper = new ContentScraper(
                location,
                maxAnchors,
                maxLinks,
                ignore_class_name,
                TagValency.EVAL,
                vocabularyScraper,
                timezoneOffset);
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
        try {
        	final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
--- a/source/net/yacy/htroot/Crawler_p.java
+++ b/source/net/yacy/htroot/Crawler_p.java
@ -66,6 +66,7 @@ import net.yacy.data.WorkTables;
 import net.yacy.document.Document;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.TagValency;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.FileUtils;
@ -733,8 +734,13 @@ public class Crawler_p {
                                } else {
                                    /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
                                    final String crawlingFileContent = post.get("crawlingFile$file", "");
-                                    final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
+                                    final ContentScraper scraper = new ContentScraper(
-                                            new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
+                                            new DigestURL(crawlingFile),
                                            10000000,
                                            new HashSet<String>(),
                                            TagValency.EVAL,
                                            new VocabularyScraper(),
                                            profile.timezoneOffset());
                                    final FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
                                            sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
                                    sb.crawler.putActive(handle, profile);
@ -874,7 +880,7 @@ public class Crawler_p {
            final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
        List<AnchorURL> hyperlinks_from_file;
        // check if the crawl filter works correctly
-        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
+        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), timezoneOffset);
        final Writer writer = new TransformerWriter(null, null, scraper, false);
        if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
            /* Let's report here detailed error to help user when he selected a wrong file */