- enhanced remote search: during the waiting time for remote crawls,
  some urls are fetched so that the url cache can be filled with them
- the url prefetch is used to sort out unresolved urls
- the snippet fetcher is triggered with the search event id; this is used
  to remove missing snippets from the search cache so they will not be
  displayed again (a sketch of the event cache follows the commit header)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4060 6c8d7289-2bf4-0310-a012-ef5d649a1542
author: orbiter
parent: a34d9b8609
commit: e332b844b2
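
The core of this change is an event cache keyed by the query id, replacing the
single static lastEvent reference. A minimal sketch of that pattern follows,
under these assumptions: getEvent, lastEvents, lastEventID and eventLifetime
mirror names from the diff below, while the register method and the created
field are hypothetical stand-ins for the constructor-side bookkeeping.

    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.Map;

    // sketch: search events are registered under their query id and looked up
    // again by that id when the snippet fetcher or a servlet needs them
    public final class SearchEventCacheSketch {

        public static final long eventLifetime = 600000; // 10 minutes, as in the diff

        private static final HashMap lastEvents = new HashMap(); // eventID -> event
        public static String lastEventID = "";

        private final long created = System.currentTimeMillis();

        public static SearchEventCacheSketch getEvent(String eventID) {
            return (SearchEventCacheSketch) lastEvents.get(eventID);
        }

        public static void register(String eventID, SearchEventCacheSketch event) {
            // drop expired events before caching the new one (hypothetical helper)
            Iterator i = lastEvents.entrySet().iterator();
            while (i.hasNext()) {
                Map.Entry entry = (Map.Entry) i.next();
                SearchEventCacheSketch e = (SearchEventCacheSketch) entry.getValue();
                if (System.currentTimeMillis() - e.created > eventLifetime) i.remove();
            }
            lastEvents.put(eventID, event);
            lastEventID = eventID;
        }
    }

A caller that only has the id from a request parameter, as in SearchEventPicture
below, can then call getEvent(eventID) and handle null for expired or unknown
events.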

@@ -94,7 +94,8 @@ public class PerformanceSearch_p {
         }
         // prepare values
-        plasmaSearchEvent se = plasmaSearchEvent.lastEvent;
+        plasmaSearchEvent se = plasmaSearchEvent.getEvent(plasmaSearchEvent.lastEventID);
         // count complete execution time
         long time = 0;
         long t;

@@ -46,6 +46,7 @@
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaGrafics;
+import de.anomic.plasma.plasmaSearchEvent;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
 import de.anomic.ymage.ymageMatrix;
@@ -56,7 +57,9 @@ public class SearchEventPicture {
     public static ymageMatrix respond(httpHeader header, serverObjects post, serverSwitch env) {
-        ymageMatrix yp = plasmaGrafics.getSearchEventPicture();
+        String eventID = (String) header.get("event", plasmaSearchEvent.lastEventID);
+        if (eventID == null) return null;
+        ymageMatrix yp = plasmaGrafics.getSearchEventPicture(eventID);
         if (yp == null) return new ymageMatrix(1, 1, "000000"); // empty image
         return yp;

@@ -16,7 +16,7 @@ public class WatchWebStructure_p {
         int width = 768;
         int height = 576;
         int depth = 3;
-        int nodes = 1000; // maximum number of host nodes that are painted
+        int nodes = 500; // maximum number of host nodes that are painted
         int time = -1;
         String host = "auto";

@@ -51,12 +51,12 @@ function Progressbar(length, parent) {
     parent.appendChild(this.element);
 }
-function AllTextSnippets(query) {
+function AllTextSnippets(query, eventID) {
     var span = document.getElementsByTagName("span");
     for(var x=0;x<span.length;x++) {
         if (span[x].className == 'snippetLoading') {
             var url = document.getElementById("url" + span[x].id.substring(1));
-            requestTextSnippet(url,query);
+            requestTextSnippet(url, query, eventID);
         }
     }
 }
@@ -79,9 +79,9 @@ function AllImageSnippets(urls, query) {
     }
 }
-function requestTextSnippet(url, query){
+function requestTextSnippet(url, query, eventID){
     var request=createRequestObject();
-    request.open('get', '/xml/snippet.xml?url=' + escape(url) + '&remove=true&media=text&search=' + escape(query),true);
+    request.open('get', '/xml/snippet.xml?url=' + escape(url) + '&remove=true&media=text&search=' + escape(query) + '&eventID=' + eventID,true);
     request.onreadystatechange = function () {handleTextState(request)};
     request.send(null);
 }

@@ -43,6 +43,7 @@ public class snippet {
         String media = post.get("media", "text");
         String querystring = post.get("search", "").trim();
+        String eventID = post.get("eventID", "").trim();
         if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
             querystring = querystring.substring(1, querystring.length() - 1).trim();
         }
@@ -66,7 +67,7 @@ public class snippet {
             prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
         } else {
             // problems with snippet fetch
-            prop.put("text", (remove) ? plasmaSnippetCache.failConsequences(snippet, queryHashes) : snippet.getError());
+            prop.put("text", (remove) ? plasmaSnippetCache.failConsequences(snippet, eventID) : snippet.getError());
         }
         prop.put("link", 0);
         prop.put("links", 0);

@@ -159,7 +159,7 @@ document.getElementById("Enter").value = "search again - catch up more links";
 #{/results}#
 <script type="text/javascript">
-    AllTextSnippets("#[former]#");
+    AllTextSnippets("#[former]#", "#[eventID]#");
     addHover();
 </script>

@@ -406,7 +406,7 @@ public class yacysearch {
                 prop.putASIS("type_results_" + i + "_snippet_text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
             } else {
                 // problems with snippet fetch
-                prop.put("type_results_" + i + "_snippet_text", (remove) ? plasmaSnippetCache.failConsequences(snippet, queryHashes) : snippet.getError());
+                prop.put("type_results_" + i + "_snippet_text", (remove) ? plasmaSnippetCache.failConsequences(snippet, theQuery.id()) : snippet.getError());
             }
             prop.put("type_results_" + i + "_snippet", 1);
         } else {
@@ -503,6 +503,7 @@ public class yacysearch {
         if (prop.getInt("type", 0) == 1) prop.put("type_mediatype", post.get("contentdom", "text"));
         prop.put("input_cat", "href");
         prop.put("input_depth", "0");
+        prop.put("type_eventID", theQuery.id());
         // adding some additional properties needed for the rss feed
         String hostName = (String) header.get("Host", "localhost");
@@ -570,7 +571,6 @@ public class yacysearch {
         prop.put("input_contentdomCheckApp", (contentdomCode == plasmaSearchQuery.CONTENTDOM_APP) ? 1 : 0);
         prop.put("type_former", post.get("search", "")); //the query-string used to get the snippets
         // return rewrite properties
         return prop;
     }

@@ -167,6 +167,13 @@ public class indexContainer extends kelondroRowSet {
         return count;
     }
+    public void removeEntriesMultiple(Set wordHashes, Set urlHashes) {
+        Iterator i = wordHashes.iterator();
+        while (i.hasNext()) {
+            removeEntries((String) i.next(), urlHashes);
+        }
+    }
+
     public Iterator entries() {
         // returns an iterator of indexEntry objects
         return new entryIterator();

@@ -120,10 +120,11 @@ public class plasmaGrafics {
     private static BufferedImage peerloadPicture = null;
     private static long peerloadPictureDate = 0;
-    public static ymageMatrix getSearchEventPicture() {
-        if (plasmaSearchEvent.lastEvent == null) return null;
-        yacySearch[] primarySearches = plasmaSearchEvent.lastEvent.getPrimarySearchThreads();
-        yacySearch[] secondarySearches = plasmaSearchEvent.lastEvent.getSecondarySearchThreads();
+    public static ymageMatrix getSearchEventPicture(String eventID) {
+        plasmaSearchEvent event = plasmaSearchEvent.getEvent(eventID);
+        if (event == null) return null;
+        yacySearch[] primarySearches = event.getPrimarySearchThreads();
+        yacySearch[] secondarySearches = event.getSecondarySearchThreads();
         if (primarySearches == null) return null; // this was a local search and there are no threads
         // get a copy of a recent network picture
@@ -159,7 +160,7 @@ public class plasmaGrafics {
         }
         // draw in the search target
-        plasmaSearchQuery query = plasmaSearchEvent.lastEvent.getQuery();
+        plasmaSearchQuery query = event.getQuery();
         Iterator i = query.queryHashes.iterator();
         eventPicture.setMode(ymageMatrix.MODE_SUB);
         eventPicture.setColor(ymageMatrix.SUBTRACTIVE_BLACK);

@@ -26,12 +26,15 @@
 package de.anomic.plasma;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeMap;
 import de.anomic.index.indexContainer;
+import de.anomic.index.indexRWIEntry;
 import de.anomic.kelondro.kelondroMSetTools;
 import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacyCore;
@@ -39,7 +42,8 @@ import de.anomic.yacy.yacySearch;
 public final class plasmaSearchEvent {
-    public static plasmaSearchEvent lastEvent = null;
+    //public static plasmaSearchEvent lastEvent = null;
+    public static String lastEventID = "";
     private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests
     public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes
@@ -57,6 +61,7 @@ public final class plasmaSearchEvent {
     private indexContainer sortedResults;
     private int lastglobal;
     private int filteredCount;
+    private ArrayList display; // an array of url hashes of urls that had been displayed as search result after this search
     private plasmaSearchEvent(plasmaSearchQuery query,
                               plasmaSearchRankingProfile ranking,
@@ -80,6 +85,7 @@ public final class plasmaSearchEvent {
         this.globalcount = 0;
         this.sortedResults = null;
         this.lastglobal = 0;
+        this.display = new ArrayList();
         long start = System.currentTimeMillis();
         if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
@@ -152,11 +158,22 @@ public final class plasmaSearchEvent {
             // sort the local containers and truncate it to a limited count,
             // so following sortings together with the global results will be fast
             localcount = rcLocal.size();
             plasmaSearchPreOrder firstsort = new plasmaSearchPreOrder(query, profileLocal, ranking, rcLocal);
             rcLocal = firstsort.strippedContainer(200);
-            // wait some time to retrieve index abstracts from primary search
+            int prefetchIndex = 0;
+            HashSet unknownURLs = new HashSet();
+            String urlhash;
+            // while we wait for the first time-out for index abstracts, we fetch urls from the url-db
+            while ((System.currentTimeMillis() < secondaryTimeout) && (prefetchIndex < rcLocal.size())) {
+                if (yacySearch.remainingWaiting(primarySearchThreads) == 0) break; // all threads have finished
+                urlhash = new String(rcLocal.get(prefetchIndex).getColBytes(0));
+                if (wordIndex.loadedURL.load(urlhash, null) == null) unknownURLs.add(urlhash);
+                prefetchIndex++;
+            }
+            // eventually wait some more time to retrieve index abstracts from primary search
             while (System.currentTimeMillis() < secondaryTimeout) {
                 if (yacySearch.remainingWaiting(primarySearchThreads) == 0) break; // all threads have finished
                 try {Thread.sleep(100);} catch (InterruptedException e) {}
@@ -165,6 +182,19 @@ public final class plasmaSearchEvent {
             // evaluate index abstracts and start a secondary search
             if (rcAbstracts != null) prepareSecondarySearch();
+            // while we wait for the second time-out for index abstracts, we fetch more urls from the url-db
+            while ((System.currentTimeMillis() < primaryTimeout) && (prefetchIndex < rcLocal.size())) {
+                if (yacySearch.remainingWaiting(primarySearchThreads) == 0) break; // all threads have finished
+                urlhash = new String(rcLocal.get(prefetchIndex).getColBytes(0));
+                if (wordIndex.loadedURL.load(urlhash, null) == null) unknownURLs.add(urlhash);
+                prefetchIndex++;
+            }
+            // when we have found some non-existing urls in the local collection, we delete them now
+            wordIndex.removeEntriesMultiple(query.queryHashes, unknownURLs);
+            rcLocal.removeEntriesMultiple(query.queryHashes, unknownURLs);
+            localcount = rcLocal.size();
             // catch up global results:
             // wait until primary timeout passed
             while (System.currentTimeMillis() < primaryTimeout) {
@@ -195,7 +225,7 @@ public final class plasmaSearchEvent {
             serverLog.logFine("SEARCH_EVENT", "SEARCHRESULT: " + profileLocal.reportToString());
         // set link for statistic
-        lastEvent = this;
+        //lastEvent = this;
         // remove old events in the event cache
         Iterator i = lastEvents.entrySet().iterator();
@@ -205,6 +235,7 @@ public final class plasmaSearchEvent {
         // store this search to a cache so it can be re-used
         lastEvents.put(query.id(), this);
+        lastEventID = query.id();
     }
     public plasmaSearchQuery getQuery() {
@@ -234,6 +265,10 @@ public final class plasmaSearchEvent {
         return this.globalcount;
     }
+    public static plasmaSearchEvent getEvent(String eventID) {
+        return (plasmaSearchEvent) lastEvents.get(eventID);
+    }
+
     public static plasmaSearchEvent getEvent(plasmaSearchQuery query,
                                              plasmaSearchRankingProfile ranking,
                                              plasmaSearchProcessing localTiming,
@@ -371,5 +406,16 @@ public final class plasmaSearchEvent {
         }
         return wordlist;
     }
+
+    public void remove(String urlhash) {
+        // removes the url hash reference from last search result
+        indexRWIEntry e = this.sortedResults.remove(urlhash);
+        assert e != null;
+        rcLocal.remove(urlhash);
+    }
+
+    public void displayed(String urlhash, int position) {
+        this.display.set(position, urlhash);
+    }
 }
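
The two prefetch loops above share one idea: rather than sleeping while remote
peers answer, walk the local result list and probe the url-db, collecting the
hashes that cannot be resolved so they can be purged afterwards. A condensed
sketch of that pattern, assuming hypothetical UrlStore and RemoteSearch
stand-ins for wordIndex.loadedURL and the yacySearch threads:

    import java.util.HashSet;
    import java.util.List;

    final class PrefetchSketch {

        interface UrlStore { Object load(String urlhash); }  // null if unknown
        interface RemoteSearch { int remainingWaiting(); }   // running remote threads

        // probe local result hashes until the deadline passes or all remote
        // threads have finished; return the hashes the url-db cannot resolve
        static HashSet prefetchUnknown(List resultHashes, UrlStore store,
                                       RemoteSearch remote, long deadline) {
            HashSet unknown = new HashSet();
            int i = 0;
            while (System.currentTimeMillis() < deadline && i < resultHashes.size()) {
                if (remote.remainingWaiting() == 0) break; // nothing left to wait for
                String urlhash = (String) resultHashes.get(i++);
                if (store.load(urlhash) == null) unknown.add(urlhash);
            }
            return unknown; // caller removes these from the word index and result set
        }
    }

The diff runs this scan in two slices against successive timeouts, carrying
prefetchIndex across, so no url is probed twice.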

@@ -867,7 +867,7 @@ public class plasmaSnippetCache {
         return result;
     }
-    public static String failConsequences(TextSnippet snippet, Set queryhashes) {
+    public static String failConsequences(TextSnippet snippet, String eventID) {
         // problems with snippet fetch
         if (yacyCore.seedDB.mySeed.isVirgin()) return snippet.getError() + " (no consequences, no network connection)"; // no consequences if we do not have a network connection
         String urlHash = plasmaURL.urlHash(snippet.getUrl());
@@ -878,11 +878,14 @@ public class plasmaSnippetCache {
             (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
             log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
             plasmaSwitchboard.getSwitchboard().wordIndex.loadedURL.remove(urlHash);
-            plasmaSwitchboard.getSwitchboard().wordIndex.removeHashReferences(queryhashes, urlHash);
+            plasmaSearchEvent event = plasmaSearchEvent.getEvent(eventID);
+            plasmaSwitchboard.getSwitchboard().wordIndex.removeEntryMultiple(event.getQuery().queryHashes, urlHash);
+            event.remove(urlHash);
         }
         if (snippet.getErrorCode() == ERROR_NO_MATCH) {
             log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-            plasmaSwitchboard.getSwitchboard().wordIndex.removeHashReferences(snippet.remaingHashes, urlHash);
+            plasmaSwitchboard.getSwitchboard().wordIndex.removeEntryMultiple(snippet.remaingHashes, urlHash);
+            plasmaSearchEvent.getEvent(eventID).remove(urlHash);
         }
         return snippet.getError();
     }
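
This is the piece that closes the loop from the commit message: the snippet
servlet forwards the eventID, so a url whose snippet cannot be fetched is
removed from the still-cached search event and will not be displayed again.
A self-contained sketch, with Event and the cache map as simplified stand-ins
for plasmaSearchEvent and its lastEvents cache:

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    final class SnippetFailureSketch {

        static final Map events = new HashMap(); // eventID -> Event

        static final class Event {
            final Set resultUrlHashes = new HashSet();
            void remove(String urlHash) { resultUrlHashes.remove(urlHash); }
        }

        // on a failed snippet fetch, clean the url out of the cached event
        static String failConsequences(String urlHash, String eventID) {
            Event event = (Event) events.get(eventID);
            if (event == null) return "event expired; nothing to clean up";
            event.remove(urlHash); // the url will not be shown on the next page render
            return "url removed from cached search result";
        }
    }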

@@ -423,6 +423,17 @@ public final class plasmaWordIndex implements indexRI {
         return removed;
     }
+    public int removeEntryMultiple(Set wordHashes, String urlHash) {
+        // remove the same url hash for multiple words
+        // this is mainly used when correcting an index after a search
+        Iterator i = wordHashes.iterator();
+        int count = 0;
+        while (i.hasNext()) {
+            if (removeEntry((String) i.next(), urlHash)) count++;
+        }
+        return count;
+    }
+
     public int removeEntries(String wordHash, Set urlHashes) {
         int removed = 0;
         synchronized (dhtInCache) {
@@ -451,26 +462,23 @@ public final class plasmaWordIndex implements indexRI {
         return removed;
     }
-    public int removeWordReferences(Set words, String urlhash) {
-        // sequentially delete all word references
-        // returns number of deletions
-        Iterator iter = words.iterator();
-        int count = 0;
-        while (iter.hasNext()) {
-            // delete the URL reference in this word index
-            if (removeEntry(plasmaCondenser.word2hash((String) iter.next()), urlhash)) count++;
+    public void removeEntriesMultiple(Set wordHashes, Set urlHashes) {
+        // remove the same url hashes for multiple words
+        // this is mainly used when correcting an index after a search
+        Iterator i = wordHashes.iterator();
+        while (i.hasNext()) {
+            removeEntries((String) i.next(), urlHashes);
         }
-        return count;
     }
-    public int removeHashReferences(Set hashes, String urlhash) {
+    public int removeWordReferences(Set words, String urlhash) {
         // sequentially delete all word references
         // returns number of deletions
-        Iterator iter = hashes.iterator();
+        Iterator iter = words.iterator();
         int count = 0;
         while (iter.hasNext()) {
             // delete the URL reference in this word index
-            if (removeEntry((String) iter.next(), urlhash)) count++;
+            if (removeEntry(plasmaCondenser.word2hash((String) iter.next()), urlhash)) count++;
         }
         return count;
     }
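
removeEntryMultiple and removeEntriesMultiple both visit every word hash of a
query to drop dead url references. A toy model of that access pattern, with
the inverted index reduced to a plain map; all names here are illustrative:

    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.Set;

    final class RemoveEntrySketch {

        // wordHash -> Set of urlHashes containing that word
        private final Map index = new HashMap();

        // remove one dead url from the entry of every query word,
        // returning the number of entries that actually changed
        int removeEntryMultiple(Set wordHashes, String urlHash) {
            int count = 0;
            Iterator i = wordHashes.iterator();
            while (i.hasNext()) {
                Set urls = (Set) index.get(i.next());
                if (urls != null && urls.remove(urlHash)) count++;
            }
            return count;
        }
    }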
