diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 4692aae0b..4659f72d6 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -48,7 +48,6 @@ import java.io.File; import java.io.Writer; -import java.util.Enumeration; import java.util.Iterator; import java.util.Map; import java.util.TreeSet; @@ -132,9 +131,9 @@ public class CacheAdmin_p { .append("EMAIL:
").append(formatAnchor(document.getEmaillinks())).append("
") .append("TEXT:
").append(new String(scraper.getText())).append("
") .append("LINES:
"); - final Enumeration sentences = document.getSentences(false); - if (sentences != null) while (sentences.hasMoreElements()) { - info.append((String) sentences.nextElement()).append("
"); + final Iterator sentences = document.getSentences(false); + if (sentences != null) while (sentences.hasNext()) { + info.append((String) sentences.next()).append("
"); } info.append("

"); if (document != null) document.close(); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 6bc426371..99a8ac7f3 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -50,7 +50,6 @@ import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URLDecoder; import java.net.URLEncoder; -import java.util.Enumeration; import java.util.Iterator; import java.util.Map; import java.util.TreeSet; @@ -312,7 +311,7 @@ public class ViewFile { prop.put("viewMode_parsedText", content); } else if (viewMode.equals("sentences")) { prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES); - final Enumeration sentences = document.getSentences(pre); + final Iterator sentences = document.getSentences(pre); boolean dark = true; int i = 0; @@ -320,9 +319,9 @@ public class ViewFile { String[] wordArray = wordArray(post.get("words", null)); // Search word highlighting - while (sentences.hasMoreElements()) { + while (sentences.hasNext()) { prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1)); - prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, (String) sentences.nextElement())); + prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, (String) sentences.next())); prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 1 : 0)); dark = !dark; i++; diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java index 075137495..d15098800 100644 --- a/htroot/xml/snippet.java +++ b/htroot/xml/snippet.java @@ -14,6 +14,7 @@ import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; +import de.anomic.server.logging.serverLog; public class snippet { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException { @@ -53,6 +54,7 @@ public class snippet { } else { String error = snippet.getError(); if ((remove) && (error.equals("no matching snippet found"))) { + serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform()); switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url)); } prop.put("text", error); diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index 869071a70..3f3749823 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -28,9 +28,11 @@ package de.anomic.index; import java.io.File; import java.io.IOException; +import java.util.Collections; import java.util.Iterator; import java.util.Map; import java.util.Set; +import java.util.SortedMap; import java.util.TreeMap; import de.anomic.kelondro.kelondroBase64Order; @@ -49,7 +51,7 @@ public final class indexRAMRI implements indexRI { // class variables private final File databaseRoot; - protected final TreeMap cache; // wordhash-container + protected final SortedMap cache; // wordhash-container private final kelondroMScoreCluster hashScore; private final kelondroMScoreCluster hashDate; private long initTime; @@ -72,7 +74,7 @@ public final class indexRAMRI implements indexRI { // creates a new index cache // the cache has a back-end where indexes that do not fit in the cache are flushed this.databaseRoot = databaseRoot; - this.cache = new TreeMap(); + this.cache = Collections.synchronizedSortedMap(new TreeMap()); this.hashScore = new kelondroMScoreCluster(); this.hashDate = new kelondroMScoreCluster(); this.initTime = System.currentTimeMillis(); diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index f69b0de52..1da6f5630 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -740,8 +740,8 @@ public final class plasmaCondenser { StringBuffer sb; char c; while (s.length() == 0) { - if (e.hasMoreElements()) { - r = (String) e.nextElement(); + if (e.hasNext()) { + r = (String) e.next(); if (r == null) return null; r = r.trim(); sb = new StringBuffer(r.length() * 2); @@ -788,7 +788,7 @@ public final class plasmaCondenser { } } - public static class sentencesFromInputStreamEnum implements Enumeration { + public static class sentencesFromInputStreamEnum implements Iterator { // read sentences from a given input stream // this enumerates String objects @@ -826,11 +826,11 @@ public final class plasmaCondenser { } } - public boolean hasMoreElements() { + public boolean hasNext() { return buffer != null; } - public Object nextElement() { + public Object next() { if (buffer == null) { return null; } else { @@ -844,6 +844,10 @@ public final class plasmaCondenser { public int count() { return counter; } + + public void remove() { + throw new UnsupportedOperationException(); + } } static String readSentence(Reader reader, boolean pre) throws IOException { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 9dae62b7c..f6fabcfdb 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -397,6 +397,7 @@ public final class plasmaCrawlStacker { indexURLEntry oldEntry = null; oldEntry = this.sb.wordIndex.loadedURL.load(nexturlhash, null); boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); + // FIXME: this does not work correctly? if ((dbocc != null) && (!(recrawl))) { reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")"; //this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 8a2a99ce5..2b0cca8ee 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -56,7 +56,6 @@ import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; import java.util.Arrays; -import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Hashtable; @@ -955,11 +954,11 @@ public final class plasmaParser { System.out.println(document.getMainLongTitle()); // found text - final Enumeration sentences = document.getSentences(false); + final Iterator sentences = document.getSentences(false); int i = 0; - if (sentences != null) while (sentences.hasMoreElements()) { + if (sentences != null) while (sentences.hasNext()) { System.out.print("line " + i + ": "); - System.out.println((String) sentences.nextElement()); + System.out.println((String) sentences.next()); i++; } diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 532b7417c..787d2077c 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -50,7 +50,6 @@ import java.io.InputStream; import java.net.MalformedURLException; import de.anomic.server.serverFileUtils; -import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -194,7 +193,7 @@ public class plasmaParserDocument { return -1; } - public Enumeration getSentences(boolean pre) { + public Iterator getSentences(boolean pre) { if (this.text == null) return null; plasmaCondenser.sentencesFromInputStreamEnum e = plasmaCondenser.sentencesFromInputStream(getText(), this.charset); e.pre(pre); diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 7028655bf..6f2e912bc 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -47,13 +47,13 @@ package de.anomic.plasma; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import de.anomic.http.httpHeader; import de.anomic.http.httpc; @@ -264,7 +264,7 @@ public class plasmaSnippetCache { // we have found a parseable non-empty file: use the lines // compute snippet from text - final Enumeration sentences = document.getSentences(pre); + final Iterator sentences = document.getSentences(pre); if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences"); String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength); @@ -282,7 +282,7 @@ public class plasmaSnippetCache { //if (hrefline != null) line += (line.length() == 0) ? hrefline : "
" + hrefline; if (textline != null) line += (line.length() == 0) ? textline : "
" + textline; - if (line == null) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found"); + if ((line == null) || (line.length() < 3 /*snippetMinLength*/)) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found"); if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength); // finally store this snippet in our own cache @@ -403,58 +403,64 @@ public class plasmaSnippetCache { return result.substring(6); } - private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) { + private String computeTextSnippet(Iterator sentences, Set queryhashes, int minLength, int maxLength) { try { if (sentences == null) return null; if ((queryhashes == null) || (queryhashes.size() == 0)) return null; - kelondroMScoreCluster hitTable = new kelondroMScoreCluster(); Iterator j; HashMap hs; String hash; - ArrayList sb = new ArrayList(); String sentence; - while (sentences.hasMoreElements()) { - sentence = (String) sentences.nextElement(); + TreeMap os = new TreeMap(); + int uniqCounter = 9999; + int score; + while (sentences.hasNext()) { + sentence = (String) sentences.next(); //System.out.println("Snippet-Sentence :" + sentence); // DEBUG if (sentence.length() > minLength) { hs = hashSentence(sentence); j = queryhashes.iterator(); + score = 0; while (j.hasNext()) { hash = (String) j.next(); if (hs.containsKey(hash)) { //System.out.println("hash " + hash + " appears in line " + i); - hitTable.incScore(new Integer(sb.size())); + score++; } } - sb.add(sentence); + if (score > 0) { + os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence); + } } } - int score = hitTable.getMaxScore(); // best number of hits - if (score <= 0) return null; - // we found (a) line(s) that have hits. - // now find the shortest line of these hits - int shortLineIndex = -1; - int shortLineLength = Integer.MAX_VALUE; - for (int i = 0; i < sb.size(); i++) { - if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) { - shortLineIndex = i; - shortLineLength = ((String) sb.get(i)).length(); + + String result; + Set remaininghashes; + while (os.size() > 0) { + sentence = (String) os.remove((Integer) os.lastKey()); // sentence with the biggest score + result = computeTextSnippet(sentence, queryhashes, minLength, maxLength); + if ((result != null) && (result.length() > 0)) { + remaininghashes = removeAppearanceHashes(result, queryhashes); + if (remaininghashes.size() == 0) { + // we have found the snippet + return result; + } else if (remaininghashes.size() < queryhashes.size()) { + // the result has not all words in it. + // find another sentence that represents the missing other words + // and find recursively more sentences + maxLength = maxLength - result.length(); + if (maxLength < 20) maxLength = 20; + String nextSnippet = computeTextSnippet(os.values().iterator(), remaininghashes, minLength / 2, maxLength); + if ((nextSnippet == null) || (nextSnippet.length() < (minLength / 2))) return null; // no success + return result + (" / " + nextSnippet); + } else { + // error + //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'"; + continue; + } } } - - // find a first result - String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength); - Set remaininghashes = removeAppearanceHashes(result, queryhashes); - - if (remaininghashes.size() == 0) return result; - // the result has not all words in it. - // find another sentence that represents the missing other words - // and find recursively more sentences - maxLength = maxLength - result.length(); - if (maxLength < 20) maxLength = 20; - String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength); - if (nextSnippet == null) return null; - return result + (" / " + nextSnippet); + return null; } catch (IndexOutOfBoundsException e) { log.logSevere("computeSnippet: error with string generation", e); return "";