fix for parser problem if a <a>-tag is 'within' html tags with unclosed

tags. That prevented the <a> tags from beeing recognized. This is a fix for http://forum.yacy-websuche.de/viewtopic.php?p=25516#p25516
13 years ago · 4d5da75814
parent eb2c8ffa62
commit 4d5da75814
1 changed files with 69 additions and 36 deletions
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@ -191,11 +191,21 @@ public final class TransformerWriter extends Writer {
    }

    private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
-//      System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug
+        //System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
+        // distinguish the following cases:
+        // - (1) not collecting data for a tag and getting no tag (not opener and not close)
+        // - (2) not collecting data for a tag and getting a tag opener
+        // - (3) not collecting data for a tag and getting a tag close
+        // - (4) collecting data for a tag and getting no tag (not opener and not close)
+        // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
+        // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
+        // - (7) collecting data for a tag and getting the correct close tag for that collecting tag
+
        if (this.filterTag == null) {
-            // we are not collection tag text
+            // we are not collection tag text -> case (1) - (3)
+
            if (tag == null) {
-                // and this is not a tag opener/closer
+                // case (1): this is not a tag opener/closer
                if (this.scraper != null) this.scraper.scrapeText(content, null);
                if (this.transformer != null) return this.transformer.transformText(content);
                return content;
@ -203,43 +213,19 @@ public final class TransformerWriter extends Writer {

            // we have a new tag
            if (opening) {
-                if ((this.scraper != null) && (this.scraper.isTag0(tag))) {
-                    // this single tag is collected at once here
-                    final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
-                    this.scraper.scrapeTag0(tag, charBuffer.propParser());
-                    charBuffer.close();
-                }
-                if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
-                    // this single tag is collected at once here
-                    final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
-                    try {
-                        return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
-                    } finally {
-                        scb.close();
-                    }
-                } else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
-                           ((this.transformer != null) && (this.transformer.isTag1(tag)))) {
-                    // ok, start collecting
-                    this.filterTag = tag;
-                    final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
-                    this.filterOpts = scb.propParser();
-                    scb.close();
-                    if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
-                    return new char[0];
-                } else {
-                     // we ignore that thing and return it again
-                     return genTag0raw(tag, true, content);
-                }
+                // case (2):
+                return filterTagOpening(tag, content, quotechar);
            }

-            // we ignore that thing and return it again
+            // its a close tag
+            // case (3): we ignore that thing and return it again
            return genTag0raw(tag, false, content);

        }

-        // we are collection tag text for the tag 'filterTag'
+        // we are collection tag text for the tag 'filterTag' -> case (4) - (7)
        if (tag == null) {
-            // go on collecting content
+            // case (4): getting no tag, go on collecting content
            if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag);
            if (this.transformer != null) {
                this.filterCont.append(this.transformer.transformText(content));
@ -250,13 +236,60 @@ public final class TransformerWriter extends Writer {
        }

        // it's a tag! which one?
-        if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) {
-            // this tag is not our concern. just add it
+        StringBuilder ret0 = new StringBuilder();
+        if (opening) {
+            // case (5): the opening should not be here. we close the previous tag as if it had been closed correctly
+            // this may happen if the html is not well-formed, like forgotten close tags
+            ret0.append(filterTagCloseing(quotechar));
+
+            // after this point we just go on and process a new tag
+            ret0.append(filterTagOpening(tag, content, quotechar));
+            return ret0.toString().toCharArray();
+        }
+
+        if (!tag.equalsIgnoreCase(this.filterTag)) {
+            // case (6): its a closing tag, but the wrong one. just add it.
            this.filterCont.append(genTag0raw(tag, opening, content));
            return new char[0];
        }

        // it's our closing tag! return complete result.
+        return filterTagCloseing(quotechar);
+    }
+
+    private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) {
+        if (this.scraper != null && this.scraper.isTag0(tag)) {
+            // this single tag is collected at once here
+            final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
+            this.scraper.scrapeTag0(tag, charBuffer.propParser());
+            charBuffer.close();
+        }
+        if (this.transformer != null && this.transformer.isTag0(tag)) {
+            // this single tag is collected at once here
+            final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
+            char[] b = new char[0];
+            try {
+                b = this.transformer.transformTag0(tag, scb.propParser(), quotechar);
+            } finally {
+                scb.close();
+            }
+            return b;
+        } else if ((this.scraper != null && this.scraper.isTag1(tag)) ||
+                   (this.transformer != null && this.transformer.isTag1(tag))) {
+            // ok, start collecting
+            this.filterTag = tag;
+            final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
+            this.filterOpts = scb.propParser();
+            scb.close();
+            if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
+            return new char[0];
+        } else {
+             // we ignore that thing and return it again
+             return genTag0raw(tag, true, content);
+        }
+    }
+
+    private char[] filterTagCloseing(final char quotechar) {
        char[] ret;
        if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
        if (this.transformer != null) {
@ -291,7 +324,7 @@ public final class TransformerWriter extends Writer {

    private char[] filterSentence(final char[] in, final char quotechar) {
        if (in.length == 0) return in;
-//      System.out.println("FILTER0: " + UTF8.String(in)); // debug
+        //System.out.println("FILTER0: " + new String(in)); // debug
        // scan the string and parse structure
        if (in.length > 2 && in[0] == lb) {