diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 77a6b0bd6..4901a2543 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -191,11 +191,21 @@ public final class TransformerWriter extends Writer { } private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) { -// System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug + //System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug + // distinguish the following cases: + // - (1) not collecting data for a tag and getting no tag (not opener and not close) + // - (2) not collecting data for a tag and getting a tag opener + // - (3) not collecting data for a tag and getting a tag close + // - (4) collecting data for a tag and getting no tag (not opener and not close) + // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag + // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different one than the opener) + // - (7) collecting data for a tag and getting the correct close tag for that collecting tag + if (this.filterTag == null) { - // we are not collection tag text + // we are not collecting tag text -> case (1) - (3) + if (tag == null) { - // and this is not a tag opener/closer + // case (1): this is not a tag opener/closer if (this.scraper != null) this.scraper.scrapeText(content, null); if (this.transformer != null) return this.transformer.transformText(content); return content; @@ -203,43 +213,19 @@ public final class 
TransformerWriter extends Writer { // we have a new tag if (opening) { - if ((this.scraper != null) && (this.scraper.isTag0(tag))) { - // this single tag is collected at once here - final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); - this.scraper.scrapeTag0(tag, charBuffer.propParser()); - charBuffer.close(); - } - if ((this.transformer != null) && (this.transformer.isTag0(tag))) { - // this single tag is collected at once here - final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); - try { - return this.transformer.transformTag0(tag, scb.propParser(), quotechar); - } finally { - scb.close(); - } - } else if (((this.scraper != null) && (this.scraper.isTag1(tag))) || - ((this.transformer != null) && (this.transformer.isTag1(tag)))) { - // ok, start collecting - this.filterTag = tag; - final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); - this.filterOpts = scb.propParser(); - scb.close(); - if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset(); - return new char[0]; - } else { - // we ignore that thing and return it again - return genTag0raw(tag, true, content); - } + // case (2): + return filterTagOpening(tag, content, quotechar); } - // we ignore that thing and return it again + // it's a close tag + // case (3): we ignore that thing and return it again return genTag0raw(tag, false, content); } - // we are collection tag text for the tag 'filterTag' + // we are collecting tag text for the tag 'filterTag' -> case (4) - (7) if (tag == null) { - // go on collecting content + // case (4): getting no tag, go on collecting content if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag); if (this.transformer != null) { this.filterCont.append(this.transformer.transformText(content)); @@ -250,13 +236,60 @@ public final class TransformerWriter extends Writer { } // it's a tag! 
which one? - if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) { - // this tag is not our concern. just add it + StringBuilder ret0 = new StringBuilder(); + if (opening) { + // case (5): the opening should not be here. we close the previous tag as if it had been closed correctly + // this may happen if the html is not well-formed, like forgotten close tags. NOTE(review): this branch runs for every opening tag seen while collecting, well-formed nested tags included; confirm that is intended + ret0.append(filterTagCloseing(quotechar)); + + // after this point we just go on and process a new tag + ret0.append(filterTagOpening(tag, content, quotechar)); + return ret0.toString().toCharArray(); + } + + if (!tag.equalsIgnoreCase(this.filterTag)) { + // case (6): it's a closing tag, but the wrong one. just add it. this.filterCont.append(genTag0raw(tag, opening, content)); return new char[0]; } // it's our closing tag! return complete result. + return filterTagCloseing(quotechar); + } + + private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) { + if (this.scraper != null && this.scraper.isTag0(tag)) { + // this single tag is collected at once here + final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); + this.scraper.scrapeTag0(tag, charBuffer.propParser()); + charBuffer.close(); + } + if (this.transformer != null && this.transformer.isTag0(tag)) { + // this single tag is collected at once here + final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); + char[] b = new char[0]; + try { + b = this.transformer.transformTag0(tag, scb.propParser(), quotechar); + } finally { + scb.close(); + } + return b; + } else if ((this.scraper != null && this.scraper.isTag1(tag)) || + (this.transformer != null && this.transformer.isTag1(tag))) { + // ok, start collecting + this.filterTag = tag; + final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); + this.filterOpts = scb.propParser(); + scb.close(); + if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, 
content.length)); else this.filterCont.reset(); + return new char[0]; + } else { + // we ignore that thing and return it again + return genTag0raw(tag, true, content); + } + } + + private char[] filterTagCloseing(final char quotechar) { char[] ret; if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars()); if (this.transformer != null) { @@ -291,7 +324,7 @@ public final class TransformerWriter extends Writer { private char[] filterSentence(final char[] in, final char quotechar) { if (in.length == 0) return in; -// System.out.println("FILTER0: " + UTF8.String(in)); // debug + //System.out.println("FILTER0: " + new String(in)); // debug // scan the string and parse structure if (in.length > 2 && in[0] == lb) {