diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
index 5c2855d8d..f90f188db 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -304,7 +304,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
}
//the 'missing' method that shall be implemented:
- public abstract void scrapeText(char[] text);
+ public abstract void scrapeText(char[] text, boolean insideTag);
// the other methods must take into account to construct the return value correctly
public abstract void scrapeTag0(String tagname, Properties tagopts);
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 44f33cb81..d7a99276e 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -115,10 +115,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.content = new serverCharBuffer(1024);
}
- public void scrapeText(char[] newtext) {
+ public final static boolean punctuation(char c) {
+ return (c == '.') || (c == '!') || (c == '?');
+ }
+
+ public void scrapeText(char[] newtext, boolean insideTag) {
// System.out.println("SCRAPE: " + new String(newtext));
- if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
- content.append(super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim()).append(32);
+ serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim();
+ if (insideTag) {
+ // text inside tags sometimes has no punctuation at the line end.
+ // this is bad for the text semantics, because it is not possible for the
+ // condenser to distinguish headlines from text beginnings.
+ // to make it easier for the condenser, a dot ('.') is appended in case that
+ // the non-empty newtext line does not already end with '.', '!' or '?'
+ if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b.append('.');
+ //System.out.println("*** Appended dot: " + b.toString());
+ }
+ if (b.length() != 0) content.append(b).append((char) 32);
}
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
diff --git a/source/de/anomic/htmlFilter/htmlFilterScraper.java b/source/de/anomic/htmlFilter/htmlFilterScraper.java
index 678eaaf4c..8c506de70 100644
--- a/source/de/anomic/htmlFilter/htmlFilterScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterScraper.java
@@ -48,7 +48,7 @@ public interface htmlFilterScraper {
public boolean isTag1(String tag);
- public void scrapeText(char[] text);
+ public void scrapeText(char[] text, boolean insideTag);
public void scrapeTag0(String tagname, Properties tagopts);
diff --git a/source/de/anomic/htmlFilter/htmlFilterWriter.java b/source/de/anomic/htmlFilter/htmlFilterWriter.java
index f174876a2..3045643c3 100644
--- a/source/de/anomic/htmlFilter/htmlFilterWriter.java
+++ b/source/de/anomic/htmlFilter/htmlFilterWriter.java
@@ -186,7 +186,7 @@ public final class htmlFilterWriter extends Writer {
// we are not collection tag text
if (tag == null) {
// and this is not a tag opener/closer
- if (scraper != null) scraper.scrapeText(content);
+ if (scraper != null) scraper.scrapeText(content, false);
if (transformer != null) return transformer.transformText(content);
return content;
}
@@ -221,7 +221,7 @@ public final class htmlFilterWriter extends Writer {
// we are collection tag text for the tag 'filterTag'
if (tag == null) {
// go on collecting content
- if (scraper != null) scraper.scrapeText(content);
+ if (scraper != null) scraper.scrapeText(content, true);
if (transformer != null) {
filterCont.append(transformer.transformText(content));
} else {
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 6dc70d2f4..a6f91fb3f 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -61,6 +61,7 @@ import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroMSetTools;
public final class plasmaCondenser {
@@ -192,7 +193,7 @@ public final class plasmaCondenser {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
// System.out.println("PARSED-WORD " + word);
wordlen = word.length();
- if ((wordlen == 1) && (punctuation(word.charAt(0)))) {
+ if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
// store sentence
if (sentence.length() > 0) {
// we store the punctuation symbol as first element of the sentence vector
@@ -470,10 +471,6 @@ public final class plasmaCondenser {
writer.close();
}
- protected final static boolean punctuation(char c) {
- return (c == '.') || (c == '!') || (c == '?');
- }
-
public final static boolean invisible(char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true;
@@ -507,7 +504,7 @@ public final class plasmaCondenser {
char c;
loop: while (e.hasMoreElements()) {
s = (String) e.nextElement();
- if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s;
+ if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
if (s.length() < ml) continue loop;
for (int i = 0; i < s.length(); i++) {
c = s.charAt(i);
@@ -562,7 +559,7 @@ public final class plasmaCondenser {
for (int i = 0; i < r.length(); i++) {
c = r.charAt(i);
if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
- else if (punctuation(c)) sb = sb.append(' ').append(c).append(' ');
+ else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else sb = sb.append(c);
}
s = sb.toString().trim();
@@ -721,7 +718,7 @@ public final class plasmaCondenser {
if (nextChar < 0) return null;
c = (char) nextChar;
s.append(c);
- if (punctuation(c)) break;
+ if (htmlFilterContentScraper.punctuation(c)) break;
}
// replace line endings and tabs by blanks