fixed scraper

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2720 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 6557112d8f
commit 72482b1426

@ -304,7 +304,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
}
//the 'missing' method that shall be implemented:
public abstract void scrapeText(char[] text, boolean insideTag);
public abstract void scrapeText(char[] text, String insideTag);
// the other methods must take into account to construct the return value correctly
public abstract void scrapeTag0(String tagname, Properties tagopts);

@ -119,10 +119,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return (c == '.') || (c == '!') || (c == '?');
}
public void scrapeText(char[] newtext, boolean insideTag) {
public void scrapeText(char[] newtext, String insideTag) {
// System.out.println("SCRAPE: " + new String(newtext));
serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim();
if (insideTag) {
if ((insideTag != null) && (!(insideTag.equals("a")))) {
// texts inside tags sometimes have no punctuation at the line end
// this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings.

@ -48,7 +48,7 @@ public interface htmlFilterScraper {
public boolean isTag1(String tag);
public void scrapeText(char[] text, boolean insideTag);
public void scrapeText(char[] text, String insideTag);
public void scrapeTag0(String tagname, Properties tagopts);

@ -186,7 +186,7 @@ public final class htmlFilterWriter extends Writer {
// we are not collecting tag text
if (tag == null) {
// and this is not a tag opener/closer
if (scraper != null) scraper.scrapeText(content, false);
if (scraper != null) scraper.scrapeText(content, null);
if (transformer != null) return transformer.transformText(content);
return content;
}
@ -221,7 +221,7 @@ public final class htmlFilterWriter extends Writer {
// we are collecting tag text for the tag 'filterTag'
if (tag == null) {
// go on collecting content
if (scraper != null) scraper.scrapeText(content, true);
if (scraper != null) scraper.scrapeText(content, filterTag);
if (transformer != null) {
filterCont.append(transformer.transformText(content));
} else {

Loading…
Cancel
Save