diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 4692aae0b..4659f72d6 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -48,7 +48,6 @@
import java.io.File;
import java.io.Writer;
-import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@@ -132,9 +131,9 @@ public class CacheAdmin_p {
.append("EMAIL:
").append(formatAnchor(document.getEmaillinks())).append("
")
.append("TEXT:
").append(new String(scraper.getText())).append("
")
.append("LINES:
");
- final Enumeration sentences = document.getSentences(false);
- if (sentences != null) while (sentences.hasMoreElements()) {
- info.append((String) sentences.nextElement()).append("
");
+ final Iterator sentences = document.getSentences(false);
+ if (sentences != null) while (sentences.hasNext()) {
+ info.append((String) sentences.next()).append("
");
}
info.append("
");
if (document != null) document.close();
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 6bc426371..99a8ac7f3 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -50,7 +50,6 @@ import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.net.URLEncoder;
-import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@@ -312,7 +311,7 @@ public class ViewFile {
prop.put("viewMode_parsedText", content);
} else if (viewMode.equals("sentences")) {
prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
- final Enumeration sentences = document.getSentences(pre);
+ final Iterator sentences = document.getSentences(pre);
boolean dark = true;
int i = 0;
@@ -320,9 +319,9 @@ public class ViewFile {
String[] wordArray = wordArray(post.get("words", null));
// Search word highlighting
- while (sentences.hasMoreElements()) {
+ while (sentences.hasNext()) {
prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
- prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, (String) sentences.nextElement()));
+ prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, (String) sentences.next()));
prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 1 : 0));
dark = !dark;
i++;
diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java
index 075137495..d15098800 100644
--- a/htroot/xml/snippet.java
+++ b/htroot/xml/snippet.java
@@ -14,6 +14,7 @@ import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
+import de.anomic.server.logging.serverLog;
public class snippet {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException {
@@ -53,6 +54,7 @@ public class snippet {
} else {
String error = snippet.getError();
if ((remove) && (error.equals("no matching snippet found"))) {
+ serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform());
switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url));
}
prop.put("text", error);
diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java
index 869071a70..3f3749823 100644
--- a/source/de/anomic/index/indexRAMRI.java
+++ b/source/de/anomic/index/indexRAMRI.java
@@ -28,9 +28,11 @@ package de.anomic.index;
import java.io.File;
import java.io.IOException;
+import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
+import java.util.SortedMap;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
@@ -49,7 +51,7 @@ public final class indexRAMRI implements indexRI {
// class variables
private final File databaseRoot;
- protected final TreeMap cache; // wordhash-container
+ protected final SortedMap cache; // wordhash-container
private final kelondroMScoreCluster hashScore;
private final kelondroMScoreCluster hashDate;
private long initTime;
@@ -72,7 +74,7 @@ public final class indexRAMRI implements indexRI {
// creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed
this.databaseRoot = databaseRoot;
- this.cache = new TreeMap();
+ this.cache = Collections.synchronizedSortedMap(new TreeMap());
this.hashScore = new kelondroMScoreCluster();
this.hashDate = new kelondroMScoreCluster();
this.initTime = System.currentTimeMillis();
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index f69b0de52..1da6f5630 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -740,8 +740,8 @@ public final class plasmaCondenser {
StringBuffer sb;
char c;
while (s.length() == 0) {
- if (e.hasMoreElements()) {
- r = (String) e.nextElement();
+ if (e.hasNext()) {
+ r = (String) e.next();
if (r == null) return null;
r = r.trim();
sb = new StringBuffer(r.length() * 2);
@@ -788,7 +788,7 @@ public final class plasmaCondenser {
}
}
- public static class sentencesFromInputStreamEnum implements Enumeration {
+ public static class sentencesFromInputStreamEnum implements Iterator {
// read sentences from a given input stream
// this enumerates String objects
@@ -826,11 +826,11 @@ public final class plasmaCondenser {
}
}
- public boolean hasMoreElements() {
+ public boolean hasNext() {
return buffer != null;
}
- public Object nextElement() {
+ public Object next() {
if (buffer == null) {
return null;
} else {
@@ -844,6 +844,10 @@ public final class plasmaCondenser {
public int count() {
return counter;
}
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
}
static String readSentence(Reader reader, boolean pre) throws IOException {
diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java
index 9dae62b7c..f6fabcfdb 100644
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@@ -397,6 +397,7 @@ public final class plasmaCrawlStacker {
indexURLEntry oldEntry = null;
oldEntry = this.sb.wordIndex.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
+ // FIXME: this does not work correctly?
if ((dbocc != null) && (!(recrawl))) {
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
//this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 8a2a99ce5..2b0cca8ee 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -56,7 +56,6 @@ import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
-import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@@ -955,11 +954,11 @@ public final class plasmaParser {
System.out.println(document.getMainLongTitle());
// found text
- final Enumeration sentences = document.getSentences(false);
+ final Iterator sentences = document.getSentences(false);
int i = 0;
- if (sentences != null) while (sentences.hasMoreElements()) {
+ if (sentences != null) while (sentences.hasNext()) {
System.out.print("line " + i + ": ");
- System.out.println((String) sentences.nextElement());
+ System.out.println((String) sentences.next());
i++;
}
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index 532b7417c..787d2077c 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -50,7 +50,6 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverFileUtils;
-import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -194,7 +193,7 @@ public class plasmaParserDocument {
return -1;
}
- public Enumeration getSentences(boolean pre) {
+ public Iterator getSentences(boolean pre) {
if (this.text == null) return null;
plasmaCondenser.sentencesFromInputStreamEnum e = plasmaCondenser.sentencesFromInputStream(getText(), this.charset);
e.pre(pre);
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 7028655bf..6f2e912bc 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -47,13 +47,13 @@ package de.anomic.plasma;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
+import java.util.TreeMap;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
@@ -264,7 +264,7 @@ public class plasmaSnippetCache {
// we have found a parseable non-empty file: use the lines
// compute snippet from text
- final Enumeration sentences = document.getSentences(pre);
+ final Iterator sentences = document.getSentences(pre);
if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
@@ -282,7 +282,7 @@ public class plasmaSnippetCache {
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "
" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "
" + textline;
- if (line == null) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
+ if ((line == null) || (line.length() < 3 /*snippetMinLength*/)) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache
@@ -403,58 +403,64 @@ public class plasmaSnippetCache {
return result.substring(6);
}
- private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
+ private String computeTextSnippet(Iterator sentences, Set queryhashes, int minLength, int maxLength) {
try {
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
- kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
HashMap hs;
String hash;
- ArrayList sb = new ArrayList();
String sentence;
- while (sentences.hasMoreElements()) {
- sentence = (String) sentences.nextElement();
+ TreeMap os = new TreeMap();
+ int uniqCounter = 9999;
+ int score;
+ while (sentences.hasNext()) {
+ sentence = (String) sentences.next();
//System.out.println("Snippet-Sentence :" + sentence); // DEBUG
if (sentence.length() > minLength) {
hs = hashSentence(sentence);
j = queryhashes.iterator();
+ score = 0;
while (j.hasNext()) {
hash = (String) j.next();
if (hs.containsKey(hash)) {
//System.out.println("hash " + hash + " appears in line " + i);
- hitTable.incScore(new Integer(sb.size()));
+ score++;
}
}
- sb.add(sentence);
+ if (score > 0) {
+ os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
+ }
}
}
- int score = hitTable.getMaxScore(); // best number of hits
- if (score <= 0) return null;
- // we found (a) line(s) that have hits.
- // now find the shortest line of these hits
- int shortLineIndex = -1;
- int shortLineLength = Integer.MAX_VALUE;
- for (int i = 0; i < sb.size(); i++) {
- if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) {
- shortLineIndex = i;
- shortLineLength = ((String) sb.get(i)).length();
+
+ String result;
+ Set remaininghashes;
+ while (os.size() > 0) {
+ sentence = (String) os.remove((Integer) os.lastKey()); // sentence with the biggest score
+ result = computeTextSnippet(sentence, queryhashes, minLength, maxLength);
+ if ((result != null) && (result.length() > 0)) {
+ remaininghashes = removeAppearanceHashes(result, queryhashes);
+ if (remaininghashes.size() == 0) {
+ // we have found the snippet
+ return result;
+ } else if (remaininghashes.size() < queryhashes.size()) {
+ // the result does not contain all of the query words.
+ // look for another sentence that covers the missing words,
+ // recursing to collect further sentences as needed
+ maxLength = maxLength - result.length();
+ if (maxLength < 20) maxLength = 20;
+ String nextSnippet = computeTextSnippet(os.values().iterator(), remaininghashes, minLength / 2, maxLength);
+ if ((nextSnippet == null) || (nextSnippet.length() < (minLength / 2))) return null; // no success
+ return result + (" / " + nextSnippet);
+ } else {
+ // error
+ //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
+ continue;
+ }
}
}
-
- // find a first result
- String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength);
- Set remaininghashes = removeAppearanceHashes(result, queryhashes);
-
- if (remaininghashes.size() == 0) return result;
- // the result has not all words in it.
- // find another sentence that represents the missing other words
- // and find recursively more sentences
- maxLength = maxLength - result.length();
- if (maxLength < 20) maxLength = 20;
- String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength);
- if (nextSnippet == null) return null;
- return result + (" / " + nextSnippet);
+ return null;
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
return "";