Lines inside tags that do not end with punctuation are extended by a single dot.

This enables the condenser to distinguish lines (e.g. headlines from the beginning of the following text) more reliably, which results in better snippet preparation. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2715 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · fd61209797
parent e25172853a
commit fd61209797
5 changed files with 25 additions and 15 deletions
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@ -304,7 +304,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
    }
    //the 'missing' method that shall be implemented:
-    public abstract void scrapeText(char[] text);
+    public abstract void scrapeText(char[] text, boolean insideTag);
    // the other methods must take into account to construct the return value correctly
    public abstract void scrapeTag0(String tagname, Properties tagopts);
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@ -115,10 +115,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        this.content = new serverCharBuffer(1024);
    }
-    public void scrapeText(char[] newtext) {
+    public final static boolean punctuation(char c) {
        return (c == '.') || (c == '!') || (c == '?');
    }
    public void scrapeText(char[] newtext, boolean insideTag) {
        // System.out.println("SCRAPE: " + new String(newtext));
-        if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
+        serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim();
-        content.append(super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim()).append(32);
+        if (insideTag) {
            // texts inside tags sometimes have no punctuation at the line end.
            // this is bad for the text semantics, because it is not possible for the
            // condenser to distinguish headlines from text beginnings.
            // to make it easier for the condenser, a dot ('.') is appended in case
            // the newtext line does not already end with a punctuation character
            if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b.append('.');
            //System.out.println("*** Appended dot: " + b.toString());
        }
        if (b.length() != 0) content.append(b).append((char) 32);
    }
    public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
--- a/source/de/anomic/htmlFilter/htmlFilterScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterScraper.java
@ -48,7 +48,7 @@ public interface htmlFilterScraper {
    public boolean isTag1(String tag);
-    public void scrapeText(char[] text);
+    public void scrapeText(char[] text, boolean insideTag);
    public void scrapeTag0(String tagname, Properties tagopts);
--- a/source/de/anomic/htmlFilter/htmlFilterWriter.java
+++ b/source/de/anomic/htmlFilter/htmlFilterWriter.java
@ -186,7 +186,7 @@ public final class htmlFilterWriter extends Writer {
            // we are not collecting tag text
            if (tag == null) {
                // and this is not a tag opener/closer
-                if (scraper != null) scraper.scrapeText(content);
+                if (scraper != null) scraper.scrapeText(content, false);
                if (transformer != null) return transformer.transformText(content);
                return content;
            }
@ -221,7 +221,7 @@ public final class htmlFilterWriter extends Writer {
        // we are collecting tag text for the tag 'filterTag'
        if (tag == null) {
            // go on collecting content
-            if (scraper != null) scraper.scrapeText(content);
+            if (scraper != null) scraper.scrapeText(content, true);
            if (transformer != null) {
                filterCont.append(transformer.transformText(content));
            } else {
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@ -61,6 +61,7 @@ import java.util.Map;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.kelondro.kelondroMSetTools;
 public final class plasmaCondenser {
@ -192,7 +193,7 @@ public final class plasmaCondenser {
            word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
            // System.out.println("PARSED-WORD " + word);
            wordlen = word.length();
-            if ((wordlen == 1) && (punctuation(word.charAt(0)))) {
+            if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
                // store sentence
                if (sentence.length() > 0) {
                    // we store the punctuation symbol as first element of the sentence vector
@ -470,10 +471,6 @@ public final class plasmaCondenser {
        writer.close();
    }
    protected final static boolean punctuation(char c) {
        return (c == '.') || (c == '!') || (c == '?');
    }
    public final static boolean invisible(char c) {
        // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
        if ((c < ' ') || (c > 'z')) return true;
@ -507,7 +504,7 @@ public final class plasmaCondenser {
            char c;
            loop: while (e.hasMoreElements()) {
                s = (String) e.nextElement();
-                if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s;
+                if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
                if (s.length() < ml) continue loop;
                for (int i = 0; i < s.length(); i++) {
                    c = s.charAt(i);
@ -562,7 +559,7 @@ public final class plasmaCondenser {
                    for (int i = 0; i < r.length(); i++) {
                        c = r.charAt(i);
                        if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
-                        else if (punctuation(c)) sb = sb.append(' ').append(c).append(' ');
+                        else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
                        else sb = sb.append(c);
                    }
                    s = sb.toString().trim(); 
@ -721,7 +718,7 @@ public final class plasmaCondenser {
            if (nextChar < 0) return null;
            c = (char) nextChar;
            s.append(c);
-            if (punctuation(c)) break;
+            if (htmlFilterContentScraper.punctuation(c)) break;
        }
        // replace line endings and tabs by blanks