Fix for a parser problem when an <a> tag is 'within' HTML tags with unclosed

tags. That prevented the <a> tags from being recognized. This is a fix for http://forum.yacy-websuche.de/viewtopic.php?p=25516#p25516
13 years ago · 4d5da75814
parent eb2c8ffa62
commit 4d5da75814
1 changed files with 69 additions and 36 deletions
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@ -191,11 +191,21 @@ public final class TransformerWriter extends Writer {
    }
    private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
-//      System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug
+        //System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
        // distinguish the following cases:
        // - (1) not collecting data for a tag and getting no tag (not opener and not close)
        // - (2) not collecting data for a tag and getting a tag opener
        // - (3) not collecting data for a tag and getting a tag close
        // - (4) collecting data for a tag and getting no tag (not opener and not close)
        // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
        // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
        // - (7) collecting data for a tag and getting the correct close tag for that collecting tag
        if (this.filterTag == null) {
-            // we are not collection tag text
+            // we are not collection tag text -> case (1) - (3)
            if (tag == null) {
-                // and this is not a tag opener/closer
+                // case (1): this is not a tag opener/closer
                if (this.scraper != null) this.scraper.scrapeText(content, null);
                if (this.transformer != null) return this.transformer.transformText(content);
                return content;
@ -203,43 +213,19 @@ public final class TransformerWriter extends Writer {
            // we have a new tag
            if (opening) {
-                if ((this.scraper != null) && (this.scraper.isTag0(tag))) {
+                // case (2):
-                    // this single tag is collected at once here
+                return filterTagOpening(tag, content, quotechar);
                    final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
                    this.scraper.scrapeTag0(tag, charBuffer.propParser());
                    charBuffer.close();
                }
                if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
                    // this single tag is collected at once here
                    final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
                    try {
                        return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
                    } finally {
                        scb.close();
                    }
                } else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
                           ((this.transformer != null) && (this.transformer.isTag1(tag)))) {
                    // ok, start collecting
                    this.filterTag = tag;
                    final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
                    this.filterOpts = scb.propParser();
                    scb.close();
                    if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
                    return new char[0];
                } else {
                     // we ignore that thing and return it again
                     return genTag0raw(tag, true, content);
                }
            }
-            // we ignore that thing and return it again
+            // its a close tag
            // case (3): we ignore that thing and return it again
            return genTag0raw(tag, false, content);
        }
-        // we are collection tag text for the tag 'filterTag'
+        // we are collection tag text for the tag 'filterTag' -> case (4) - (7)
        if (tag == null) {
-            // go on collecting content
+            // case (4): getting no tag, go on collecting content
            if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag);
            if (this.transformer != null) {
                this.filterCont.append(this.transformer.transformText(content));
@ -250,13 +236,60 @@ public final class TransformerWriter extends Writer {
        }
        // it's a tag! which one?
-        if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) {
+        StringBuilder ret0 = new StringBuilder();
-            // this tag is not our concern. just add it
+        if (opening) {
            // case (5): the opening should not be here. we close the previous tag as if it had been closed correctly
            // this may happen if the html is not well-formed, like forgotten close tags
            ret0.append(filterTagCloseing(quotechar));
            // after this point we just go on and process a new tag
            ret0.append(filterTagOpening(tag, content, quotechar));
            return ret0.toString().toCharArray();
        }
        if (!tag.equalsIgnoreCase(this.filterTag)) {
            // case (6): its a closing tag, but the wrong one. just add it.
            this.filterCont.append(genTag0raw(tag, opening, content));
            return new char[0];
        }
        // it's our closing tag! return complete result.
        return filterTagCloseing(quotechar);
    }
    /**
     * Processes an opening tag.
     * <ul>
     * <li>If the scraper reports the tag as a tag0, the tag is handed to the scraper's
     *     {@code scrapeTag0} together with the properties parsed from {@code content}.</li>
     * <li>If the transformer reports the tag as a tag0, the result of its
     *     {@code transformTag0} is returned.</li>
     * <li>Otherwise, if the scraper or the transformer reports the tag as a tag1,
     *     collecting starts: {@code filterTag} and {@code filterOpts} are set,
     *     {@code filterCont} is created or reset, and an empty array is returned.</li>
     * <li>Otherwise the tag is returned as produced by {@code genTag0raw}.</li>
     * </ul>
     * Every temporary {@code CharBuffer} is closed in a {@code finally} block so it is
     * released even when the scraper, the transformer or the property parser throws.
     *
     * @param tag       name of the opening tag
     * @param content   characters the tag properties are parsed from
     * @param quotechar quote character passed on to the transformer
     * @return the characters to emit for this tag; empty when collecting has started
     */
    private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) {
        if (this.scraper != null && this.scraper.isTag0(tag)) {
            // this single tag is collected at once here
            final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
            try {
                this.scraper.scrapeTag0(tag, charBuffer.propParser());
            } finally {
                charBuffer.close();
            }
        }

        if (this.transformer != null && this.transformer.isTag0(tag)) {
            // this single tag is transformed at once here
            final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
            try {
                return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
            } finally {
                scb.close();
            }
        }

        if ((this.scraper != null && this.scraper.isTag1(tag)) ||
            (this.transformer != null && this.transformer.isTag1(tag))) {
            // ok, start collecting
            this.filterTag = tag;
            final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
            try {
                this.filterOpts = scb.propParser();
            } finally {
                scb.close();
            }
            if (this.filterCont == null) {
                this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length));
            } else {
                this.filterCont.reset();
            }
            return new char[0];
        }

        // we ignore that thing and return it again
        return genTag0raw(tag, true, content);
    }
    private char[] filterTagCloseing(final char quotechar) {
        char[] ret;
        if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
        if (this.transformer != null) {
@ -291,7 +324,7 @@ public final class TransformerWriter extends Writer {
    private char[] filterSentence(final char[] in, final char quotechar) {
        if (in.length == 0) return in;
-//      System.out.println("FILTER0: " + UTF8.String(in)); // debug
+        //System.out.println("FILTER0: " + new String(in)); // debug
        // scan the string and parse structure
        if (in.length > 2 && in[0] == lb) {