diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 40f414288..f3e1041c0 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -48,6 +48,7 @@
import java.io.File;
import java.io.Writer;
+import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@@ -128,9 +129,9 @@ public class CacheAdmin_p {
.append("EMAIL:
").append(formatAnchor(document.getEmaillinks())).append("
")
.append("TEXT:
").append(new String(scraper.getText())).append("
")
.append("LINES:
");
- final String[] sentences = document.getSentences();
- for (int i = 0; i < sentences.length; i++) {
- info.append(sentences[i]).append("
");
+ final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+ if (sentences != null) while (sentences.hasMoreElements()) {
+ info.append((String) sentences.nextElement()).append("
");
}
info.append("
");
if (document != null) document.close();
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 525c9d7e4..8681df3b4 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -49,6 +49,7 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
+import java.util.Enumeration;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
@@ -262,11 +263,12 @@ public class ViewFile {
prop.put("viewMode_parsedText",content);
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
- String[] sentences = document.getSentences();
+ final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
boolean dark = true;
- for (int i=0; i < sentences.length; i++) {
- String currentSentence = wikiCode.replaceHTML(sentences[i]);
+ int i = 0;
+ if (sentences != null) while (sentences.hasMoreElements()) {
+ String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
// Search word highlighting
String words = post.get("words",null);
@@ -286,8 +288,9 @@ public class ViewFile {
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+ i++;
}
- prop.put("viewMode_sentences",sentences.length);
+ prop.put("viewMode_sentences", i);
}
if (document != null) document.close();
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index d72eb43f8..6dc70d2f4 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -51,6 +51,8 @@ import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
@@ -469,7 +471,7 @@ public final class plasmaCondenser {
}
protected final static boolean punctuation(char c) {
- return ("!?.".indexOf(c) >= 0);
+ return (c == '.') || (c == '!') || (c == '?'); // true for the sentence-ending marks '.', '!' and '?'
}
public final static boolean invisible(char c) {
@@ -648,7 +650,89 @@ public final class plasmaCondenser {
return counter;
}
}
+
+ public static Enumeration sentencesFromInputStream(InputStream is, String charset) {
+     Enumeration sentences = null; // stays null when the charset name is not supported
+     try {
+         sentences = new sentencesFromInputStreamEnum(is, charset);
+     } catch (UnsupportedEncodingException e) { /* unknown charset: the caller receives null */ }
+     return sentences;
+ }
+
+ private static class sentencesFromInputStreamEnum implements Enumeration {
+     // Enumeration of String sentences read from an input stream; one sentence is
+     // always pre-fetched into 'buffer', so hasMoreElements() itself does no reading
+
+     Object buffer = null;   // pre-fetched sentence; null once nothing more can be read
+     BufferedReader raf;     // character source the sentences are read from
+     int counter = 0;        // sum of (length + 1) over all sentences handed out so far
+
+     public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
+         // a null charset selects the platform default encoding of InputStreamReader
+         InputStreamReader decoder;
+         if (charset == null) decoder = new InputStreamReader(is); else decoder = new InputStreamReader(is, charset);
+         raf = new BufferedReader(decoder);
+         buffer = nextElement0();
+         counter = 0;
+     }
+
+     // fetches the next sentence; closes the reader and yields null when readSentence returns null or fails
+     private Object nextElement0() {
+         String s = null;
+         try {
+             s = readSentence(raf);
+             if (s == null) raf.close();
+         } catch (IOException e) {
+             // a failed read ends the enumeration; closing the reader is best effort
+             try {
+                 raf.close();
+             } catch (Exception ee) {
+             }
+         }
+         return s;
+     }
+
+     public boolean hasMoreElements() {
+         return !(buffer == null);
+     }
+
+     // hands out the pre-fetched sentence and fetches the following one; null when exhausted
+     public Object nextElement() {
+         if (buffer == null) return null;
+         final Object current = buffer;
+         counter += ((String) current).length() + 1;
+         buffer = nextElement0();
+         return current;
+     }
+     public int count() {
+         return counter;
+     }
+ }
+
+ static String readSentence(Reader reader) throws IOException {
+     // Reads one sentence, i.e. all characters up to and including the next '.', '!' or '?'.
+     // Returns null once the reader holds no further text (nothing or only whitespace is left).
+     StringBuffer s = new StringBuffer();
+     int nextChar;
+     // find sentence end; text after the last punctuation mark is kept as a final sentence
+     for (;;) {
+         nextChar = reader.read();
+         if (nextChar < 0) {
+             if (s.toString().trim().length() == 0) return null;
+             break;
+         }
+         s.append((char) nextChar);
+         if (punctuation((char) nextChar)) break;
+     }
+     // replace line endings and tabs by blanks (10 = LF, 13 = CR, 9 = TAB)
+     for (int i = 0; i < s.length(); i++) {
+         if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 9)) s.setCharAt(i, ' ');
+     }
+     // remove all double-spaces: shrink every run of blanks to a single blank
+     int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
+     return new String(s);
+ }
/*
private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) {
// we store lines containing a key in search vector
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index b420b7ffc..81404dc06 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -55,6 +55,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
+import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@@ -815,12 +816,12 @@ public final class plasmaParser {
System.out.println(document.getMainLongTitle());
// found text
- String[] sentences = document.getSentences();
- if (sentences != null) {
- for (int i = 0; i < sentences.length; i++) {
+ final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+ int i = 0;
+ if (sentences != null) while (sentences.hasMoreElements()) {
System.out.print("line " + i + ": ");
- System.out.println(sentences[i]);
- }
+ System.out.println((String) sentences.nextElement());
+ i++;
}
// found links
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index a9b276d1f..d032d7b9e 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -50,6 +50,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverFileUtils;
+import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -189,13 +190,9 @@ public class plasmaParserDocument {
return -1;
}
- public plasmaCondenser getCondenser() {
- if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0);
- return condenser;
- }
-
- public String[] getSentences() {
- return getCondenser().sentences();
+ public Enumeration getSentences(String charset) {
+     // no text stored in this document: nothing to enumerate, signalled as null
+     return (this.text == null) ? null : plasmaCondenser.sentencesFromInputStream(getText(), charset);
}
public String getKeywords(char separator) {
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 80a63a1a1..e1e7e71eb 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -47,6 +47,7 @@ package de.anomic.plasma;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
@@ -249,10 +250,10 @@ public class plasmaSnippetCache {
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url);
- String[] sentences = document.getSentences();
+ final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
document.close();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
- if ((sentences == null) || (sentences.length == 0)) {
+ if (sentences == null) {
//System.out.println("found no sentences in url " + url);
return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
}
@@ -357,26 +358,30 @@ public class plasmaSnippetCache {
return (String) snippetsCache.get(key);
}
- private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
+ private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
try {
- if ((sentences == null) || (sentences.length == 0)) return null;
+ if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
HashMap hs;
String hash;
- for (int i = 0; i < sentences.length; i++) {
+ ArrayList sb = new ArrayList();
+ String sentence;
+ while (sentences.hasMoreElements()) {
+ sentence = (String) sentences.nextElement();
//System.out.println("Sentence " + i + ": " + sentences[i]);
- if (sentences[i].length() > minLength) {
- hs = hashSentence(sentences[i]);
+ if (sentence.length() > minLength) {
+ hs = hashSentence(sentence);
j = queryhashes.iterator();
while (j.hasNext()) {
hash = (String) j.next();
if (hs.containsKey(hash)) {
//System.out.println("hash " + hash + " appears in line " + i);
- hitTable.incScore(new Integer(i));
+ hitTable.incScore(new Integer(sb.size()));
}
}
+ sb.add(sentence);
}
}
int score = hitTable.getMaxScore(); // best number of hits
@@ -385,15 +390,14 @@ public class plasmaSnippetCache {
// now find the shortest line of these hits
int shortLineIndex = -1;
int shortLineLength = Integer.MAX_VALUE;
- for (int i = 0; i < sentences.length; i++) {
- if ((hitTable.getScore(new Integer(i)) == score) &&
- (sentences[i].length() < shortLineLength)) {
+ for (int i = 0; i < sb.size(); i++) {
+ if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) {
shortLineIndex = i;
- shortLineLength = sentences[i].length();
+ shortLineLength = ((String) sb.get(i)).length();
}
}
// find a first result
- String result = sentences[shortLineIndex];
+ String result = (String) sb.get(shortLineIndex);
// remove all hashes that appear in the result
hs = hashSentence(result);
j = queryhashes.iterator();