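fixed scraper: scrapeText now receives the name of the enclosing tag (null when outside a tag) instead of a boolean, and text inside 'a' tags is excluded from the inside-tag handling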

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2720 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · 72482b1426
parent 6557112d8f
commit 72482b1426
4 changed files with 6 additions and 6 deletions
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -304,7 +304,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
    }

    //the 'missing' method that shall be implemented:
-    public abstract void scrapeText(char[] text, boolean insideTag);
+    public abstract void scrapeText(char[] text, String insideTag);

    // the other methods must take into account to construct the return value correctly
    public abstract void scrapeTag0(String tagname, Properties tagopts);
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -119,10 +119,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        return (c == '.') || (c == '!') || (c == '?');
    }
    
-    public void scrapeText(char[] newtext, boolean insideTag) {
+    public void scrapeText(char[] newtext, String insideTag) {
        // System.out.println("SCRAPE: " + new String(newtext));
        serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim();
-        if (insideTag) {
+        if ((insideTag != null) && (!(insideTag.equals("a")))) {
            // texts inside tags sometimes have no punctuation at the line end
            // this is bad for the text sematics, because it is not possible for the
            // condenser to distinguish headlines from text beginnings.
--- a/source/de/anomic/htmlFilter/htmlFilterScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterScraper.java
@@ -48,7 +48,7 @@ public interface htmlFilterScraper {

    public boolean isTag1(String tag);

-    public void scrapeText(char[] text, boolean insideTag);
+    public void scrapeText(char[] text, String insideTag);

    public void scrapeTag0(String tagname, Properties tagopts);

--- a/source/de/anomic/htmlFilter/htmlFilterWriter.java
+++ b/source/de/anomic/htmlFilter/htmlFilterWriter.java
@@ -186,7 +186,7 @@ public final class htmlFilterWriter extends Writer {
            // we are not collection tag text
            if (tag == null) {
                // and this is not a tag opener/closer
-                if (scraper != null) scraper.scrapeText(content, false);
+                if (scraper != null) scraper.scrapeText(content, null);
                if (transformer != null) return transformer.transformText(content);
                return content;
            }
@@ -221,7 +221,7 @@ public final class htmlFilterWriter extends Writer {
        // we are collection tag text for the tag 'filterTag'
        if (tag == null) {
            // go on collecting content
-            if (scraper != null) scraper.scrapeText(content, true);
+            if (scraper != null) scraper.scrapeText(content, filterTag);
            if (transformer != null) {
                filterCont.append(transformer.transformText(content));
            } else {