fix for parser problem if a <a>-tag is 'within' html tags with unclosed

tags. That prevented the <a> tags from beeing recognized. This is a fix
for http://forum.yacy-websuche.de/viewtopic.php?p=25516#p25516
pull/1/head
Michael Peter Christen 13 years ago
parent eb2c8ffa62
commit 4d5da75814

@ -191,11 +191,21 @@ public final class TransformerWriter extends Writer {
} }
private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) { private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
// System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug //System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
// distinguish the following cases:
// - (1) not collecting data for a tag and getting no tag (not opener and not close)
// - (2) not collecting data for a tag and getting a tag opener
// - (3) not collecting data for a tag and getting a tag close
// - (4) collecting data for a tag and getting no tag (not opener and not close)
// - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
// - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
// - (7) collecting data for a tag and getting the correct close tag for that collecting tag
if (this.filterTag == null) { if (this.filterTag == null) {
// we are not collection tag text // we are not collection tag text -> case (1) - (3)
if (tag == null) { if (tag == null) {
// and this is not a tag opener/closer // case (1): this is not a tag opener/closer
if (this.scraper != null) this.scraper.scrapeText(content, null); if (this.scraper != null) this.scraper.scrapeText(content, null);
if (this.transformer != null) return this.transformer.transformText(content); if (this.transformer != null) return this.transformer.transformText(content);
return content; return content;
@ -203,43 +213,19 @@ public final class TransformerWriter extends Writer {
// we have a new tag // we have a new tag
if (opening) { if (opening) {
if ((this.scraper != null) && (this.scraper.isTag0(tag))) { // case (2):
// this single tag is collected at once here return filterTagOpening(tag, content, quotechar);
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
charBuffer.close();
}
if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
// this single tag is collected at once here
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
try {
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
scb.close();
}
} else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
((this.transformer != null) && (this.transformer.isTag1(tag)))) {
// ok, start collecting
this.filterTag = tag;
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.filterOpts = scb.propParser();
scb.close();
if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
return new char[0];
} else {
// we ignore that thing and return it again
return genTag0raw(tag, true, content);
}
} }
// we ignore that thing and return it again // its a close tag
// case (3): we ignore that thing and return it again
return genTag0raw(tag, false, content); return genTag0raw(tag, false, content);
} }
// we are collection tag text for the tag 'filterTag' // we are collection tag text for the tag 'filterTag' -> case (4) - (7)
if (tag == null) { if (tag == null) {
// go on collecting content // case (4): getting no tag, go on collecting content
if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag); if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag);
if (this.transformer != null) { if (this.transformer != null) {
this.filterCont.append(this.transformer.transformText(content)); this.filterCont.append(this.transformer.transformText(content));
@ -250,13 +236,60 @@ public final class TransformerWriter extends Writer {
} }
// it's a tag! which one? // it's a tag! which one?
if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) { StringBuilder ret0 = new StringBuilder();
// this tag is not our concern. just add it if (opening) {
// case (5): the opening should not be here. we close the previous tag as if it had been closed correctly
// this may happen if the html is not well-formed, like forgotten close tags
ret0.append(filterTagCloseing(quotechar));
// after this point we just go on and process a new tag
ret0.append(filterTagOpening(tag, content, quotechar));
return ret0.toString().toCharArray();
}
if (!tag.equalsIgnoreCase(this.filterTag)) {
// case (6): its a closing tag, but the wrong one. just add it.
this.filterCont.append(genTag0raw(tag, opening, content)); this.filterCont.append(genTag0raw(tag, opening, content));
return new char[0]; return new char[0];
} }
// it's our closing tag! return complete result. // it's our closing tag! return complete result.
return filterTagCloseing(quotechar);
}
private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) {
if (this.scraper != null && this.scraper.isTag0(tag)) {
// this single tag is collected at once here
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
charBuffer.close();
}
if (this.transformer != null && this.transformer.isTag0(tag)) {
// this single tag is collected at once here
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
char[] b = new char[0];
try {
b = this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
scb.close();
}
return b;
} else if ((this.scraper != null && this.scraper.isTag1(tag)) ||
(this.transformer != null && this.transformer.isTag1(tag))) {
// ok, start collecting
this.filterTag = tag;
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.filterOpts = scb.propParser();
scb.close();
if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
return new char[0];
} else {
// we ignore that thing and return it again
return genTag0raw(tag, true, content);
}
}
private char[] filterTagCloseing(final char quotechar) {
char[] ret; char[] ret;
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars()); if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
if (this.transformer != null) { if (this.transformer != null) {
@ -291,7 +324,7 @@ public final class TransformerWriter extends Writer {
private char[] filterSentence(final char[] in, final char quotechar) { private char[] filterSentence(final char[] in, final char quotechar) {
if (in.length == 0) return in; if (in.length == 0) return in;
// System.out.println("FILTER0: " + UTF8.String(in)); // debug //System.out.println("FILTER0: " + new String(in)); // debug
// scan the string and parse structure // scan the string and parse structure
if (in.length > 2 && in[0] == lb) { if (in.length > 2 && in[0] == lb) {

Loading…
Cancel
Save