Better snippet handling when loading a snippet fails

see also http://www.yacy-forum.de/viewtopic.php?p=31096#31096

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3475 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent d451ad48d3
commit 9f929b5438

@ -221,7 +221,7 @@ public class DetailedSearch {
return prop;
}
final String delHash = post.get("deleteref", "");
sb.wordIndex.removeReferences(query, delHash);
sb.wordIndex.removeWordReferences(query, delHash);
}
// prepare search order

@ -15,7 +15,6 @@ import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
public class snippet {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException {
@ -54,23 +53,20 @@ public class snippet {
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
}
}
// find snippet
if (media.equals("text")) {
// attach text snippet
plasmaSnippetCache.TextSnippet snippet = switchboard.snippetCache.retrieveTextSnippet(url, queryHashes, true, pre, 260, textsnippet_timeout);
prop.put("status",snippet.getSource());
if (snippet.getSource() < 11) {
prop.put("status",snippet.getErrorCode());
if (snippet.getErrorCode() < 11) {
// no problems occurred
//prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
} else {
String error = snippet.getError();
if ((remove) && (error.equals("no matching snippet found"))) {
serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform());
switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url));
}
prop.put("text", error);
// problems with snippet fetch
prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, query) : snippet.getError());
}
prop.put("link", 0);
prop.put("links", 0);

@ -210,7 +210,7 @@ public class yacysearch {
// delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash
sb.wordIndex.removeReferences(query, delHash);
sb.wordIndex.removeWordReferences(query, delHash);
// make new news message with negative voting
HashMap map = new HashMap();

@ -423,6 +423,16 @@ public class kelondroMSetTools {
return list;
}
/**
 * Joins the string forms of all elements of a set into a single string,
 * in the set's iteration order, with one separator character between
 * neighbouring elements (no leading or trailing separator).
 *
 * @param set       the elements to join; a null or empty set yields ""
 * @param separator the character placed between two consecutive elements
 * @return the joined string, never null
 */
public static String setToString(Set set, char separator) {
    // callers may hand over a set that was never populated (null); treat it like an empty set
    if (set == null || set.isEmpty()) return "";
    Iterator i = set.iterator();
    // 7 characters per element is only a sizing guess for the buffer
    StringBuffer sb = new StringBuffer(set.size() * 7);
    // String.valueOf renders a null element as "null" instead of throwing
    sb.append(String.valueOf(i.next()));
    while (i.hasNext()) {
        sb.append(separator).append(String.valueOf(i.next()));
    }
    return sb.toString();
}
// ------------------------------------------------------------------------------------------------

@ -178,19 +178,6 @@ public final class plasmaSearchQuery {
return result.toString();
}
/*
public String hashes(String separator) {
StringBuffer result = new StringBuffer(8 * queryHashes.size());
Iterator i = queryHashes.iterator();
if (i.hasNext()) result.append((String) i.next());
while (i.hasNext()) {
result.append(separator);
result.append((String) i.next());
}
return result.toString();
}
*/
public void filterOut(Set blueList) {
// filter out words that appear in this set
Iterator it = queryWords.iterator();

@ -62,6 +62,7 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
@ -109,13 +110,20 @@ public class plasmaSnippetCache {
}
public class TextSnippet {
private URL url;
private String line;
private String error;
private int source;
public TextSnippet(String line, int source, String errortext) {
private int errorCode;
private Set remaingHashes;
public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) {
this.url = url;
this.line = line;
this.source = source;
this.errorCode = errorCode;
this.error = errortext;
this.remaingHashes = remaingHashes;
}
/**
 * Returns the URL that was handed to the constructor of this snippet.
 *
 * @return the stored URL, exactly as passed at construction
 */
public URL getUrl() {
return this.url;
}
public boolean exists() {
return line != null;
@ -129,6 +137,12 @@ public class plasmaSnippetCache {
/**
 * Returns the error text attached to this snippet.
 *
 * @return the trimmed error text, or the empty string when no error text was set
 */
public String getError() {
    if (error == null) {
        return "";
    }
    return error.trim();
}
/**
 * Returns the numeric code stored at construction.
 * Note that the same slot is used for two purposes: failed lookups store one
 * of the ERROR_* constants here, while successful lookups store their source
 * identifier. The snippet servlet treats values below 11 as "no problems occurred".
 *
 * @return the code passed to the constructor
 */
public int getErrorCode() {
return errorCode;
}
/**
 * Returns the set that was passed to the constructor as remaining hashes.
 * Presumably these are the query word hashes that could not be located in
 * the document (verify against retrieveTextSnippet).
 *
 * @return the stored set; may be null, since successfully computed snippets
 *         are constructed with a null set
 */
public Set getRemainingHashes() {
return this.remaingHashes;
}
public String getLineMarked(Set queryHashes) {
if (line == null) return "";
if ((queryHashes == null) || (queryHashes.size() == 0)) return line.trim();
@ -199,9 +213,6 @@ public class plasmaSnippetCache {
}
return l.toString().trim();
}
public int getSource() {
return source;
}
}
public class MediaSnippet {
@ -225,7 +236,7 @@ public class plasmaSnippetCache {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for URL retrieve " + url);
return new TextSnippet(null, ERROR_NO_HASH_GIVEN, "no query hashes given");
return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given");
}
String urlhash = plasmaURL.urlHash(url);
@ -235,7 +246,7 @@ public class plasmaSnippetCache {
String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) {
//System.out.println("found snippet for URL " + url + " in cache: " + line);
return new TextSnippet(line, source, null);
return new TextSnippet(url, line, source, null, null);
}
/* ===========================================================================
@ -273,15 +284,15 @@ public class plasmaSnippetCache {
}
// if it is still not available, report an error
if (resContent == null) return new TextSnippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource, plasmaHTCache.Entry cache is NULL");
source = SOURCE_WEB;
} else {
return new TextSnippet(null, ERROR_SOURCE_LOADING, "no resource available");
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "no resource available");
}
} catch (Exception e) {
if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
return new TextSnippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "error loading resource: " + e.getMessage());
}
/* ===========================================================================
@ -291,11 +302,11 @@ public class plasmaSnippetCache {
try {
document = parseDocument(url, resContentLength, resContent, resInfo);
} catch (ParserException e) {
return new TextSnippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
} finally {
try { resContent.close(); } catch (Exception e) {/* ignore this */}
}
if (document == null) return new TextSnippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed
/* ===========================================================================
@ -305,8 +316,10 @@ public class plasmaSnippetCache {
// compute snippet from text
final Iterator sentences = document.getSentences(pre);
if (sentences == null) return new TextSnippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences");
Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
String textline = (tsr == null) ? null : (String) tsr[0];
Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
// compute snippet from media
String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
@ -322,13 +335,13 @@ public class plasmaSnippetCache {
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
if ((line == null) || (line.length() < 3 /*snippetMinLength*/)) return new TextSnippet(null, ERROR_NO_MATCH, "no matching snippet found");
if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found");
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
document.close();
return new TextSnippet(line, source, null);
return new TextSnippet(url, line, source, null, null);
}
/**
@ -458,34 +471,25 @@ public class plasmaSnippetCache {
return result.substring(6);
}
private String computeTextSnippet(Iterator sentences, Set queryhashes, int minLength, int maxLength) {
private Object[] /*{String - the snippet, Set - remaining hashes}*/
computeTextSnippet(Iterator sentences, Set queryhashes, int maxLength) {
try {
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
Iterator j;
HashMap hs;
String hash;
StringBuffer sentence;
TreeMap os = new TreeMap();
int uniqCounter = 9999;
int score;
while (sentences.hasNext()) {
sentence = (StringBuffer) sentences.next();
//System.out.println("Snippet-Sentence :" + sentence); // DEBUG
if (sentence.length() > minLength) {
hs = hashSentence(sentence.toString());
j = queryhashes.iterator();
score = 0;
while (j.hasNext()) {
hash = (String) j.next();
if (hs.containsKey(hash)) {
//System.out.println("hash " + hash + " appears in line " + i);
score++;
}
}
if (score > 0) {
os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
}
hs = hashSentence(sentence.toString());
j = queryhashes.iterator();
score = 0;
while (j.hasNext()) {if (hs.containsKey((String) j.next())) score++;}
if (score > 0) {
os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
}
}
@ -493,21 +497,24 @@ public class plasmaSnippetCache {
Set remaininghashes;
while (os.size() > 0) {
sentence = (StringBuffer) os.remove((Integer) os.lastKey()); // sentence with the biggest score
result = computeTextSnippet(sentence.toString(), queryhashes, minLength, maxLength);
Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength);
if (tsr == null) continue;
result = (String) tsr[0];
if ((result != null) && (result.length() > 0)) {
remaininghashes = removeAppearanceHashes(result, queryhashes);
remaininghashes = (Set) tsr[1];
if (remaininghashes.size() == 0) {
// we have found the snippet
return result;
return new Object[]{result, remaininghashes};
} else if (remaininghashes.size() < queryhashes.size()) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
String nextSnippet = computeTextSnippet(os.values().iterator(), remaininghashes, minLength / 2, maxLength);
if ((nextSnippet == null) || (nextSnippet.length() < (minLength / 2))) return null; // no success
return result + (" / " + nextSnippet);
tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength);
String nextSnippet = (String) tsr[0];
if (nextSnippet == null) return tsr;
return new Object[]{result + (" / " + nextSnippet), tsr[1]};
} else {
// error
//assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
@ -518,11 +525,12 @@ public class plasmaSnippetCache {
return null;
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
return "";
return new Object[]{null, queryhashes};
}
}
private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
private Object[] /*{String - the snippet, Set - remaining hashes}*/
computeTextSnippet(String sentence, Set queryhashes, int maxLength) {
try {
if (sentence == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@ -535,10 +543,13 @@ public class plasmaSnippetCache {
j = queryhashes.iterator();
Integer pos;
int p, minpos = sentence.length(), maxpos = -1;
HashSet remainingHashes = new HashSet();
while (j.hasNext()) {
hash = (String) j.next();
pos = (Integer) hs.get(hash);
if (pos != null) {
if (pos == null) {
remainingHashes.add(hash);
} else {
p = pos.intValue();
if (p > maxpos) maxpos = p;
if (p < minpos) minpos = p;
@ -579,7 +590,7 @@ public class plasmaSnippetCache {
// trim sentence, 3rd step (cut in the middle)
sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
}
return sentence;
return new Object[] {sentence, remainingHashes};
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
return null;
@ -838,46 +849,24 @@ public class plasmaSnippetCache {
return result;
}
/*
public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
// fetch snippets
int i = 0;
indexURLEntry urlentry;
String urlstring;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
urlentry = acc.nextElement();
indexURLEntry.Components comp = urlentry.comp();
if (comp.url().getHost().endsWith(".yacyh")) continue;
urlstring = comp.url().toNormalform();
if ((urlstring.matches(urlmask)) &&
(!(existsInCache(comp.url(), queryhashes)))) {
new Fetcher(comp.url(), queryhashes, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), (int) maxTime).start();
i++;
}
}
}
public class Fetcher extends Thread {
URL url;
Set queryhashes;
int timeout;
boolean pre;
public Fetcher(URL url, Set queryhashes, boolean pre, int timeout) {
if (url.getHost().endsWith(".yacyh")) return;
this.url = url;
this.queryhashes = queryhashes;
this.timeout = timeout;
this.pre = pre;
public String failConsequences(TextSnippet snippet, Set queryhashes) {
// problems with snippet fetch
String urlHash = plasmaURL.urlHash(snippet.getUrl());
String querystring = kelondroMSetTools.setToString(snippet.getRemainingHashes(), ' ');
if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
(snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
(snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
(snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError());
sb.wordIndex.loadedURL.remove(urlHash);
sb.wordIndex.removeHashReferences(queryhashes, urlHash);
}
public void run() {
log.logFine("snippetFetcher: try to get URL " + url);
plasmaSnippetCache.TextSnippet snippet = retrieveTextSnippet(url, queryhashes, true, pre, 260, timeout);
if (snippet.line == null)
log.logFine("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error);
else
log.logFine("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source);
if (snippet.getErrorCode() == ERROR_NO_MATCH) {
log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform() + ", cause: " + snippet.getError());
sb.wordIndex.removeHashReferences(snippet.remaingHashes, urlHash);
}
return snippet.getError();
}
*/
}

@ -2741,7 +2741,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
filename = comp.url().getFile();
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
wordIndex.removeReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash());
wordIndex.removeWordReferences(plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8").keySet(), urlentry.hash());
wordIndex.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
@ -2887,7 +2887,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// delete all word references
int count = 0;
if (words != null) count = wordIndex.removeReferences(words, urlhash);
if (words != null) count = wordIndex.removeWordReferences(words, urlhash);
// finally delete the url entry itself
wordIndex.loadedURL.remove(urlhash);

@ -426,16 +426,26 @@ public final class plasmaWordIndex implements indexRI {
return removed;
}
public int removeReferences(Set words, String urlhash) {
public int removeWordReferences(Set words, String urlhash) {
// sequentially delete all word references
// returns number of deletions
Iterator iter = words.iterator();
String word;
int count = 0;
while (iter.hasNext()) {
word = (String) iter.next();
// delete the URL reference in this word index
if (removeEntry(plasmaCondenser.word2hash(word), urlhash)) count++;
if (removeEntry(plasmaCondenser.word2hash((String) iter.next()), urlhash)) count++;
}
return count;
}
/**
 * Removes the reference to one URL from the index entry of every given word hash.
 * The hashes are processed one after another in the set's iteration order.
 *
 * @param hashes  word hashes (Strings) whose reference to the URL shall be dropped
 * @param urlhash hash of the URL to be dereferenced
 * @return the number of word hashes for which removeEntry reported a deletion
 */
public int removeHashReferences(Set hashes, String urlhash) {
    int deletions = 0;
    for (Iterator it = hashes.iterator(); it.hasNext();) {
        final String wordHash = (String) it.next();
        // drop the URL reference from this word's index entry
        if (removeEntry(wordHash, urlhash)) {
            deletions++;
        }
    }
    return deletions;
}

Loading…
Cancel
Save