diff --git a/htroot/DetailedSearch.java b/htroot/DetailedSearch.java
index cd25283fd..bb9ae7d5f 100644
--- a/htroot/DetailedSearch.java
+++ b/htroot/DetailedSearch.java
@@ -221,7 +221,7 @@ public class DetailedSearch {
return prop;
}
final String delHash = post.get("deleteref", "");
- sb.wordIndex.removeReferences(query, delHash);
+ sb.wordIndex.removeWordReferences(query, delHash);
}
// prepare search order
diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java
index 316b4ba79..22f5b3dd8 100644
--- a/htroot/xml/snippet.java
+++ b/htroot/xml/snippet.java
@@ -15,7 +15,6 @@ import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
-import de.anomic.server.logging.serverLog;
public class snippet {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException {
@@ -54,23 +53,20 @@ public class snippet {
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
- }
+ }
// find snippet
if (media.equals("text")) {
// attach text snippet
plasmaSnippetCache.TextSnippet snippet = switchboard.snippetCache.retrieveTextSnippet(url, queryHashes, true, pre, 260, textsnippet_timeout);
- prop.put("status",snippet.getSource());
- if (snippet.getSource() < 11) {
+ prop.put("status",snippet.getErrorCode());
+ if (snippet.getErrorCode() < 11) {
+ // no problems occurred
//prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
} else {
- String error = snippet.getError();
- if ((remove) && (error.equals("no matching snippet found"))) {
- serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform());
- switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url));
- }
- prop.put("text", error);
+ // problems with snippet fetch
+ prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, query) : snippet.getError());
}
prop.put("link", 0);
prop.put("links", 0);
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 24003a452..4f8cd191f 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -210,7 +210,7 @@ public class yacysearch {
// delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash
- sb.wordIndex.removeReferences(query, delHash);
+ sb.wordIndex.removeWordReferences(query, delHash);
// make new news message with negative voting
HashMap map = new HashMap();
diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java
index 302ee339f..877baf023 100644
--- a/source/de/anomic/kelondro/kelondroMSetTools.java
+++ b/source/de/anomic/kelondro/kelondroMSetTools.java
@@ -423,6 +423,16 @@ public class kelondroMSetTools {
return list;
}
+ public static String setToString(Set set, char separator) {
+ Iterator i = set.iterator();
+ StringBuffer sb = new StringBuffer(set.size() * 7);
+ if (i.hasNext()) sb.append(i.next().toString());
+ while (i.hasNext()) {
+ sb.append(separator).append(i.next().toString());
+ }
+ return new String(sb);
+ }
+
// ------------------------------------------------------------------------------------------------
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index 4fb9d26d0..a92895811 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -178,19 +178,6 @@ public final class plasmaSearchQuery {
return result.toString();
}
- /*
- public String hashes(String separator) {
- StringBuffer result = new StringBuffer(8 * queryHashes.size());
- Iterator i = queryHashes.iterator();
- if (i.hasNext()) result.append((String) i.next());
- while (i.hasNext()) {
- result.append(separator);
- result.append((String) i.next());
- }
- return result.toString();
- }
- */
-
public void filterOut(Set blueList) {
// filter out words that appear in this set
Iterator it = queryWords.iterator();
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index ce20ef7d5..ab4c16ef2 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -62,6 +62,7 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroMScoreCluster;
+import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
@@ -109,13 +110,20 @@ public class plasmaSnippetCache {
}
public class TextSnippet {
+ private URL url;
private String line;
private String error;
- private int source;
- public TextSnippet(String line, int source, String errortext) {
+ private int errorCode;
+ private Set remaingHashes;
+ public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) {
+ this.url = url;
this.line = line;
- this.source = source;
+ this.errorCode = errorCode;
this.error = errortext;
+ this.remaingHashes = remaingHashes;
+ }
+ public URL getUrl() {
+ return this.url;
}
public boolean exists() {
return line != null;
@@ -129,6 +137,12 @@ public class plasmaSnippetCache {
public String getError() {
return (error == null) ? "" : error.trim();
}
+ public int getErrorCode() {
+ return errorCode;
+ }
+ public Set getRemainingHashes() {
+ return this.remaingHashes;
+ }
public String getLineMarked(Set queryHashes) {
if (line == null) return "";
if ((queryHashes == null) || (queryHashes.size() == 0)) return line.trim();
@@ -199,9 +213,6 @@ public class plasmaSnippetCache {
}
return l.toString().trim();
}
- public int getSource() {
- return source;
- }
}
public class MediaSnippet {
@@ -225,7 +236,7 @@ public class plasmaSnippetCache {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for URL retrieve " + url);
- return new TextSnippet(null, ERROR_NO_HASH_GIVEN, "no query hashes given");
+ return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given");
}
String urlhash = plasmaURL.urlHash(url);
@@ -235,7 +246,7 @@ public class plasmaSnippetCache {
String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) {
//System.out.println("found snippet for URL " + url + " in cache: " + line);
- return new TextSnippet(line, source, null);
+ return new TextSnippet(url, line, source, null, null);
}
/* ===========================================================================
@@ -273,15 +284,15 @@ public class plasmaSnippetCache {
}
// if it is still not available, report an error
- if (resContent == null) return new TextSnippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
+ if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource, plasmaHTCache.Entry cache is NULL");
source = SOURCE_WEB;
} else {
- return new TextSnippet(null, ERROR_SOURCE_LOADING, "no resource available");
+ return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "no resource available");
}
} catch (Exception e) {
if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
- return new TextSnippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
+ return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "error loading resource: " + e.getMessage());
}
/* ===========================================================================
@@ -291,11 +302,11 @@ public class plasmaSnippetCache {
try {
document = parseDocument(url, resContentLength, resContent, resInfo);
} catch (ParserException e) {
- return new TextSnippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
+ return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
} finally {
try { resContent.close(); } catch (Exception e) {/* ignore this */}
}
- if (document == null) return new TextSnippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
+ if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed
/* ===========================================================================
@@ -305,8 +316,10 @@ public class plasmaSnippetCache {
// compute snippet from text
final Iterator sentences = document.getSentences(pre);
- if (sentences == null) return new TextSnippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
- String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
+ if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences");
+ Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
+ String textline = (tsr == null) ? null : (String) tsr[0];
+ Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
// compute snippet from media
String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
@@ -322,13 +335,13 @@ public class plasmaSnippetCache {
            //if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br>" + hrefline;
            if (textline != null) line += (line.length() == 0) ? textline : "<br>" + textline;
- if ((line == null) || (line.length() < 3 /*snippetMinLength*/)) return new TextSnippet(null, ERROR_NO_MATCH, "no matching snippet found");
+ if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found");
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
document.close();
- return new TextSnippet(line, source, null);
+ return new TextSnippet(url, line, source, null, null);
}
/**
@@ -458,34 +471,25 @@ public class plasmaSnippetCache {
return result.substring(6);
}
- private String computeTextSnippet(Iterator sentences, Set queryhashes, int minLength, int maxLength) {
+ private Object[] /*{String - the snippet, Set - remaining hashes}*/
+ computeTextSnippet(Iterator sentences, Set queryhashes, int maxLength) {
try {
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
Iterator j;
HashMap hs;
- String hash;
StringBuffer sentence;
TreeMap os = new TreeMap();
int uniqCounter = 9999;
int score;
while (sentences.hasNext()) {
sentence = (StringBuffer) sentences.next();
- //System.out.println("Snippet-Sentence :" + sentence); // DEBUG
- if (sentence.length() > minLength) {
- hs = hashSentence(sentence.toString());
- j = queryhashes.iterator();
- score = 0;
- while (j.hasNext()) {
- hash = (String) j.next();
- if (hs.containsKey(hash)) {
- //System.out.println("hash " + hash + " appears in line " + i);
- score++;
- }
- }
- if (score > 0) {
- os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
- }
+ hs = hashSentence(sentence.toString());
+ j = queryhashes.iterator();
+ score = 0;
+ while (j.hasNext()) {if (hs.containsKey((String) j.next())) score++;}
+ if (score > 0) {
+ os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
}
}
@@ -493,21 +497,24 @@ public class plasmaSnippetCache {
Set remaininghashes;
while (os.size() > 0) {
sentence = (StringBuffer) os.remove((Integer) os.lastKey()); // sentence with the biggest score
- result = computeTextSnippet(sentence.toString(), queryhashes, minLength, maxLength);
+ Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength);
+ if (tsr == null) continue;
+ result = (String) tsr[0];
if ((result != null) && (result.length() > 0)) {
- remaininghashes = removeAppearanceHashes(result, queryhashes);
+ remaininghashes = (Set) tsr[1];
if (remaininghashes.size() == 0) {
// we have found the snippet
- return result;
+ return new Object[]{result, remaininghashes};
} else if (remaininghashes.size() < queryhashes.size()) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
- String nextSnippet = computeTextSnippet(os.values().iterator(), remaininghashes, minLength / 2, maxLength);
- if ((nextSnippet == null) || (nextSnippet.length() < (minLength / 2))) return null; // no success
- return result + (" / " + nextSnippet);
+ tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength);
+ String nextSnippet = (String) tsr[0];
+ if (nextSnippet == null) return tsr;
+ return new Object[]{result + (" / " + nextSnippet), tsr[1]};
} else {
// error
//assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
@@ -518,11 +525,12 @@ public class plasmaSnippetCache {
return null;
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
- return "";
+ return new Object[]{null, queryhashes};
}
}
- private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
+ private Object[] /*{String - the snippet, Set - remaining hashes}*/
+ computeTextSnippet(String sentence, Set queryhashes, int maxLength) {
try {
if (sentence == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@@ -535,10 +543,13 @@ public class plasmaSnippetCache {
j = queryhashes.iterator();
Integer pos;
int p, minpos = sentence.length(), maxpos = -1;
+ HashSet remainingHashes = new HashSet();
while (j.hasNext()) {
hash = (String) j.next();
pos = (Integer) hs.get(hash);
- if (pos != null) {
+ if (pos == null) {
+ remainingHashes.add(hash);
+ } else {
p = pos.intValue();
if (p > maxpos) maxpos = p;
if (p < minpos) minpos = p;
@@ -579,7 +590,7 @@ public class plasmaSnippetCache {
// trim sentence, 3rd step (cut in the middle)
sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
}
- return sentence;
+ return new Object[] {sentence, remainingHashes};
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
return null;
@@ -838,46 +849,24 @@ public class plasmaSnippetCache {
return result;
}
- /*
- public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
- // fetch snippets
- int i = 0;
- indexURLEntry urlentry;
- String urlstring;
- long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
- while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
- urlentry = acc.nextElement();
- indexURLEntry.Components comp = urlentry.comp();
- if (comp.url().getHost().endsWith(".yacyh")) continue;
- urlstring = comp.url().toNormalform();
- if ((urlstring.matches(urlmask)) &&
- (!(existsInCache(comp.url(), queryhashes)))) {
- new Fetcher(comp.url(), queryhashes, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), (int) maxTime).start();
- i++;
- }
- }
- }
-
- public class Fetcher extends Thread {
- URL url;
- Set queryhashes;
- int timeout;
- boolean pre;
- public Fetcher(URL url, Set queryhashes, boolean pre, int timeout) {
- if (url.getHost().endsWith(".yacyh")) return;
- this.url = url;
- this.queryhashes = queryhashes;
- this.timeout = timeout;
- this.pre = pre;
+
+ public String failConsequences(TextSnippet snippet, Set queryhashes) {
+ // problems with snippet fetch
+ String urlHash = plasmaURL.urlHash(snippet.getUrl());
+ String querystring = kelondroMSetTools.setToString(snippet.getRemainingHashes(), ' ');
+ if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
+ (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
+ (snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
+ (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
+ log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError());
+ sb.wordIndex.loadedURL.remove(urlHash);
+ sb.wordIndex.removeHashReferences(queryhashes, urlHash);
}
- public void run() {
- log.logFine("snippetFetcher: try to get URL " + url);
- plasmaSnippetCache.TextSnippet snippet = retrieveTextSnippet(url, queryhashes, true, pre, 260, timeout);
- if (snippet.line == null)
- log.logFine("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error);
- else
- log.logFine("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source);
+ if (snippet.getErrorCode() == ERROR_NO_MATCH) {
+ log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError());
+ sb.wordIndex.removeHashReferences(snippet.remaingHashes, urlHash);
}
+ return snippet.getError();
}
- */
+
}
\ No newline at end of file
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index d3816746f..cba80353d 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -2741,7 +2741,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
filename = comp.url().getFile();
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
- wordIndex.removeReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash());
+ wordIndex.removeWordReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash());
wordIndex.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
@@ -2887,7 +2887,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// delete all word references
int count = 0;
- if (words != null) count = wordIndex.removeReferences(words, urlhash);
+ if (words != null) count = wordIndex.removeWordReferences(words, urlhash);
// finally delete the url entry itself
wordIndex.loadedURL.remove(urlhash);
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 577a9fe76..b0d2ef52b 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -426,16 +426,26 @@ public final class plasmaWordIndex implements indexRI {
return removed;
}
- public int removeReferences(Set words, String urlhash) {
+ public int removeWordReferences(Set words, String urlhash) {
// sequentially delete all word references
// returns number of deletions
Iterator iter = words.iterator();
- String word;
int count = 0;
while (iter.hasNext()) {
- word = (String) iter.next();
// delete the URL reference in this word index
- if (removeEntry(plasmaCondenser.word2hash(word), urlhash)) count++;
+ if (removeEntry(plasmaCondenser.word2hash((String) iter.next()), urlhash)) count++;
+ }
+ return count;
+ }
+
+ public int removeHashReferences(Set hashes, String urlhash) {
+ // sequentially delete all word references
+ // returns number of deletions
+ Iterator iter = hashes.iterator();
+ int count = 0;
+ while (iter.hasNext()) {
+ // delete the URL reference in this word index
+ if (removeEntry((String) iter.next(), urlhash)) count++;
}
return count;
}