diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index 77a6b0bd6..4901a2543 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -191,11 +191,21 @@ public final class TransformerWriter extends Writer {
}
private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
-// System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug
+ //System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
+ // distinguish the following cases:
+ // - (1) not collecting data for a tag and getting no tag (not opener and not close)
+ // - (2) not collecting data for a tag and getting a tag opener
+ // - (3) not collecting data for a tag and getting a tag close
+ // - (4) collecting data for a tag and getting no tag (not opener and not close)
+ // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
+ // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
+ // - (7) collecting data for a tag and getting the correct close tag for that collecting tag
+
if (this.filterTag == null) {
- // we are not collection tag text
+ // we are not collection tag text -> case (1) - (3)
+
if (tag == null) {
- // and this is not a tag opener/closer
+ // case (1): this is not a tag opener/closer
if (this.scraper != null) this.scraper.scrapeText(content, null);
if (this.transformer != null) return this.transformer.transformText(content);
return content;
@@ -203,43 +213,19 @@ public final class TransformerWriter extends Writer {
// we have a new tag
if (opening) {
- if ((this.scraper != null) && (this.scraper.isTag0(tag))) {
- // this single tag is collected at once here
- final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
- this.scraper.scrapeTag0(tag, charBuffer.propParser());
- charBuffer.close();
- }
- if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
- // this single tag is collected at once here
- final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
- try {
- return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
- } finally {
- scb.close();
- }
- } else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
- ((this.transformer != null) && (this.transformer.isTag1(tag)))) {
- // ok, start collecting
- this.filterTag = tag;
- final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
- this.filterOpts = scb.propParser();
- scb.close();
- if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
- return new char[0];
- } else {
- // we ignore that thing and return it again
- return genTag0raw(tag, true, content);
- }
+ // case (2):
+ return filterTagOpening(tag, content, quotechar);
}
- // we ignore that thing and return it again
+ // its a close tag
+ // case (3): we ignore that thing and return it again
return genTag0raw(tag, false, content);
}
- // we are collection tag text for the tag 'filterTag'
+ // we are collection tag text for the tag 'filterTag' -> case (4) - (7)
if (tag == null) {
- // go on collecting content
+ // case (4): getting no tag, go on collecting content
if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag);
if (this.transformer != null) {
this.filterCont.append(this.transformer.transformText(content));
@@ -250,13 +236,60 @@ public final class TransformerWriter extends Writer {
}
// it's a tag! which one?
- if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) {
- // this tag is not our concern. just add it
+ StringBuilder ret0 = new StringBuilder();
+ if (opening) {
+ // case (5): the opening should not be here. we close the previous tag as if it had been closed correctly
+ // this may happen if the html is not well-formed, like forgotten close tags
+ ret0.append(filterTagCloseing(quotechar));
+
+ // after this point we just go on and process a new tag
+ ret0.append(filterTagOpening(tag, content, quotechar));
+ return ret0.toString().toCharArray();
+ }
+
+ if (!tag.equalsIgnoreCase(this.filterTag)) {
+ // case (6): its a closing tag, but the wrong one. just add it.
this.filterCont.append(genTag0raw(tag, opening, content));
return new char[0];
}
// it's our closing tag! return complete result.
+ return filterTagCloseing(quotechar);
+ }
+
+ private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) {
+ if (this.scraper != null && this.scraper.isTag0(tag)) {
+ // this single tag is collected at once here
+ final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
+ this.scraper.scrapeTag0(tag, charBuffer.propParser());
+ charBuffer.close();
+ }
+ if (this.transformer != null && this.transformer.isTag0(tag)) {
+ // this single tag is collected at once here
+ final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
+ char[] b = new char[0];
+ try {
+ b = this.transformer.transformTag0(tag, scb.propParser(), quotechar);
+ } finally {
+ scb.close();
+ }
+ return b;
+ } else if ((this.scraper != null && this.scraper.isTag1(tag)) ||
+ (this.transformer != null && this.transformer.isTag1(tag))) {
+ // ok, start collecting
+ this.filterTag = tag;
+ final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
+ this.filterOpts = scb.propParser();
+ scb.close();
+ if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
+ return new char[0];
+ } else {
+ // we ignore that thing and return it again
+ return genTag0raw(tag, true, content);
+ }
+ }
+
+ private char[] filterTagCloseing(final char quotechar) {
char[] ret;
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
if (this.transformer != null) {
@@ -291,7 +324,7 @@ public final class TransformerWriter extends Writer {
private char[] filterSentence(final char[] in, final char quotechar) {
if (in.length == 0) return in;
-// System.out.println("FILTER0: " + UTF8.String(in)); // debug
+ //System.out.println("FILTER0: " + new String(in)); // debug
// scan the string and parse structure
if (in.length > 2 && in[0] == lb) {