fixed a parsing bug by reverting SVN 7766

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7910 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent ce11b7b6d2
commit 231074bf0a

@ -117,10 +117,10 @@ public final class TransformerWriter extends Writer {
bb.append((int)'>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
return result;
}
@ -137,10 +137,10 @@ public final class TransformerWriter extends Writer {
bb.append((int)'<').append((int)'/').append(tagname).append((int)'>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
return result;
}
@ -155,10 +155,10 @@ public final class TransformerWriter extends Writer {
bb.append((int)'>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
return result;
}
@ -168,10 +168,10 @@ public final class TransformerWriter extends Writer {
cb.append(text).append((int)'<').append((int)'/').append(tagname).append((int)'>');
final char[] result = cb.getChars();
try {
cb.close();
} catch (final IOException e) {
Log.logException(e);
}
cb.close();
} catch (final IOException e) {
Log.logException(e);
}
return result;
}
@ -188,89 +188,53 @@ public final class TransformerWriter extends Writer {
}
final char[] result;
if (bb.length() > 0)
result = bb.getChars(1);
result = bb.getChars(1);
else
result = bb.getChars();
result = bb.getChars();
try {
bb.close();
} catch (final IOException ex) {
Log.logException(ex);
}
bb.close();
} catch (final IOException ex) {
Log.logException(ex);
}
return result;
}
/**
 * Joins two character arrays into a single array, separated by one space.
 * If either argument is null, the other argument is returned as-is
 * (so the result is null when both are null).
 *
 * @param a the leading characters, may be null
 * @param b the trailing characters, may be null
 * @return a new array holding a, a single space and b; or the other
 *         argument itself when one of them is null
 */
private static final char[] mergedScrape(final char[] a, final char[] b) {
    if (a == null) {
        return b;
    }
    if (b == null) {
        return a;
    }
    // layout of the result: [ a... | ' ' | b... ]
    final char[] merged = new char[a.length + 1 + b.length];
    System.arraycopy(a, 0, merged, 0, a.length);
    merged[a.length] = ' ';
    System.arraycopy(b, 0, merged, a.length + 1, b.length);
    return merged;
}
/**
* Every tag that appears is handed to the filterTag method. The method returns text from the tag
* but also operates on the tag content to scrape information from it. If a tag is still unclosed
* when another tag appears, the unclosed tag and the new one are merged into one new char[].
* @param tag
* @param opening
* @param content
* @param quotechar
* @return
*/
private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
//System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
char[] unclosed = null;
if (this.filterTag != null && opening) {
// there is a missing close tag for the currently parsed tag filterTag
// close that tag here and go on with new tag
if (this.scraper != null) {
this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
}
if (this.transformer != null) {
unclosed = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
} else {
unclosed = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
}
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
}
// System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug
if (this.filterTag == null) {
// we are not collecting tag text
if (tag == null) {
// and this is not a tag opener/closer
if (this.scraper != null) this.scraper.scrapeText(content, null);
if (this.transformer != null) return mergedScrape(unclosed, this.transformer.transformText(content));
return mergedScrape(unclosed, content);
if (this.transformer != null) return this.transformer.transformText(content);
return content;
}
// we have a new tag
if (opening) {
if ((this.scraper != null) && (this.scraper.isTag0(tag))) {
// this single tag is collected at once here
final CharBuffer charBuffer = new CharBuffer(content);
final CharBuffer charBuffer = new CharBuffer(content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
try {
charBuffer.close();
} catch (final IOException e) {
// TODO Auto-generated catch block
Log.logException(e);
}
charBuffer.close();
} catch (final IOException e) {
// TODO Auto-generated catch block
Log.logException(e);
}
}
if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
// this single tag is collected at once here
final CharBuffer scb = new CharBuffer(content);
try {
return mergedScrape(unclosed, this.transformer.transformTag0(tag, scb.propParser(), quotechar));
} finally {
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
}
final CharBuffer scb = new CharBuffer(content);
try {
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
}
} else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
((this.transformer != null) && (this.transformer.isTag1(tag)))) {
// ok, start collecting
@ -278,20 +242,20 @@ public final class TransformerWriter extends Writer {
final CharBuffer scb = new CharBuffer(content);
this.filterOpts = scb.propParser();
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
if (this.filterCont == null) this.filterCont = new CharBuffer(Math.max(100, content.length)); else this.filterCont.reset();
return mergedScrape(unclosed, new char[0]);
return new char[0];
} else {
// we ignore that thing and return it again
return mergedScrape(unclosed, genTag0raw(tag, true, content));
return genTag0raw(tag, true, content);
}
}
// we ignore that thing and return it again
return mergedScrape(unclosed, genTag0raw(tag, false, content));
return genTag0raw(tag, false, content);
}
@ -299,21 +263,19 @@ public final class TransformerWriter extends Writer {
if (tag == null) {
// go on collecting content
if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag);
try {
if (this.transformer != null) {
this.filterCont.append(this.transformer.transformText(content));
} else {
this.filterCont.append(content);
}
} catch (final OutOfMemoryError e) {}
return mergedScrape(unclosed, new char[0]);
if (this.transformer != null) {
this.filterCont.append(this.transformer.transformText(content));
} else {
this.filterCont.append(content);
}
return new char[0];
}
// it's a tag! which one?
if (opening || !(tag.equalsIgnoreCase(this.filterTag))) {
if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) {
// this tag is not our concern. just add it
this.filterCont.append(genTag0raw(tag, opening, content));
return mergedScrape(unclosed, new char[0]);
return new char[0];
}
// it's our closing tag! return complete result.
@ -327,7 +289,7 @@ public final class TransformerWriter extends Writer {
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
return mergedScrape(unclosed, ret);
return ret;
}
private char[] filterFinalize(final char quotechar) {
@ -337,9 +299,7 @@ public final class TransformerWriter extends Writer {
// it's our closing tag! return complete result.
char[] ret;
if (this.scraper != null) {
this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
}
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
} else {
@ -413,7 +373,7 @@ public final class TransformerWriter extends Writer {
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), singlequote);
if (this.out != null) { this.out.write(filtered); }
// buffer = new serverByteBuffer();
// this.buffer = new serverByteBuffer();
this.buffer.reset();
}
} else if (this.inDoubleQuote) {
@ -425,7 +385,7 @@ public final class TransformerWriter extends Writer {
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// buffer = new serverByteBuffer();
// this.buffer = new serverByteBuffer();
this.buffer.reset();
}
} else if (this.inComment) {
@ -438,7 +398,7 @@ public final class TransformerWriter extends Writer {
final char[] comment = this.buffer.getChars();
if (this.scraper != null) this.scraper.scrapeComment(comment);
if (this.out != null) this.out.write(comment);
// buffer = new serverByteBuffer();
// this.buffer = new serverByteBuffer();
this.buffer.reset();
}
} else if (this.inScript) {
@ -456,7 +416,7 @@ public final class TransformerWriter extends Writer {
// script is at end
this.inScript = false;
if (this.out != null) this.out.write(this.buffer.getChars());
// buffer = new serverByteBuffer();
// this.buffer = new serverByteBuffer();
this.buffer.reset();
}
} else if (this.inStyle) {
@ -473,7 +433,7 @@ public final class TransformerWriter extends Writer {
// style is at end
this.inStyle = false;
if (this.out != null) this.out.write(this.buffer.getChars());
// buffer = new serverByteBuffer();
// this.buffer = new serverByteBuffer();
this.buffer.reset();
}
} else {
@ -517,7 +477,7 @@ public final class TransformerWriter extends Writer {
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// buffer = new serverByteBuffer();
// this.buffer = new serverByteBuffer();
this.buffer.reset();
} else if (c == lb) {
// this is an error case
@ -526,7 +486,7 @@ public final class TransformerWriter extends Writer {
filtered = filterSentence(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// buffer = new serverByteBuffer();
// this.buffer = new serverByteBuffer();
this.buffer.reset();
this.buffer.append(c);
} else {
@ -540,7 +500,7 @@ public final class TransformerWriter extends Writer {
filtered = filterSentence(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// buffer = new serverByteBuffer();
// this.buffer = new serverByteBuffer();
this.buffer.reset();
this.buffer.append(c);
} else {
@ -566,7 +526,7 @@ public final class TransformerWriter extends Writer {
}
public void flush() throws IOException {
// we cannot flush the current string buffer to prevent that
// we cannot flush the current string this.buffer to prevent that
// the filter process is messed up
// instead, we simply flush the underlying output stream
if (this.out != null) this.out.flush();

Loading…
Cancel
Save