fixed scraper

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2720 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 6557112d8f
commit 72482b1426

@ -304,7 +304,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
} }
//the 'missing' method that shall be implemented: //the 'missing' method that shall be implemented:
public abstract void scrapeText(char[] text, boolean insideTag); public abstract void scrapeText(char[] text, String insideTag);
// the other methods must take into account to construct the return value correctly // the other methods must take into account to construct the return value correctly
public abstract void scrapeTag0(String tagname, Properties tagopts); public abstract void scrapeTag0(String tagname, Properties tagopts);

@ -119,10 +119,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return (c == '.') || (c == '!') || (c == '?'); return (c == '.') || (c == '!') || (c == '?');
} }
public void scrapeText(char[] newtext, boolean insideTag) { public void scrapeText(char[] newtext, String insideTag) {
// System.out.println("SCRAPE: " + new String(newtext)); // System.out.println("SCRAPE: " + new String(newtext));
serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim(); serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim();
if (insideTag) { if ((insideTag != null) && (!(insideTag.equals("a")))) {
// texts inside tags sometimes have no punctuation at the line end // texts inside tags sometimes have no punctuation at the line end
// this is bad for the text semantics, because it is not possible for the // this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings. // condenser to distinguish headlines from text beginnings.

@ -48,7 +48,7 @@ public interface htmlFilterScraper {
public boolean isTag1(String tag); public boolean isTag1(String tag);
public void scrapeText(char[] text, boolean insideTag); public void scrapeText(char[] text, String insideTag);
public void scrapeTag0(String tagname, Properties tagopts); public void scrapeTag0(String tagname, Properties tagopts);

@ -186,7 +186,7 @@ public final class htmlFilterWriter extends Writer {
// we are not collecting tag text // we are not collecting tag text
if (tag == null) { if (tag == null) {
// and this is not a tag opener/closer // and this is not a tag opener/closer
if (scraper != null) scraper.scrapeText(content, false); if (scraper != null) scraper.scrapeText(content, null);
if (transformer != null) return transformer.transformText(content); if (transformer != null) return transformer.transformText(content);
return content; return content;
} }
@ -221,7 +221,7 @@ public final class htmlFilterWriter extends Writer {
// we are collecting tag text for the tag 'filterTag' // we are collecting tag text for the tag 'filterTag'
if (tag == null) { if (tag == null) {
// go on collecting content // go on collecting content
if (scraper != null) scraper.scrapeText(content, true); if (scraper != null) scraper.scrapeText(content, filterTag);
if (transformer != null) { if (transformer != null) {
filterCont.append(transformer.transformText(content)); filterCont.append(transformer.transformText(content));
} else { } else {

Loading…
Cancel
Save