From efb4ca8fa8b84ba3478938161495e53b29c337b7 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 22 Jan 2011 09:46:00 +0000 Subject: [PATCH] modified auto-delete of search failure-words: - words are now not deleted from the search index automatically if index receive is switched off - a flag in the network definition defines if this feature is switched on at all - the search filter for not-found word references is switched off for server-side remote searches git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7441 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.network.freeworld.unit | 1 + defaults/yacy.network.intranet.unit | 1 + defaults/yacy.network.webportal.unit | 1 + htroot/yacy/search.java | 6 ++++-- htroot/yacysearch.java | 3 ++- source/de/anomic/search/QueryParams.java | 6 +++++- source/de/anomic/search/ResultFetcher.java | 2 +- source/de/anomic/search/Switchboard.java | 6 ++++-- source/de/anomic/search/SwitchboardConstants.java | 2 ++ 9 files changed, 21 insertions(+), 7 deletions(-) diff --git a/defaults/yacy.network.freeworld.unit b/defaults/yacy.network.freeworld.unit index d473c1809..f43219b91 100644 --- a/defaults/yacy.network.freeworld.unit +++ b/defaults/yacy.network.freeworld.unit @@ -15,6 +15,7 @@ network.unit.dht = true network.unit.dhtredundancy.junior = 1 network.unit.dhtredundancy.senior = 3 network.unit.dht.partitionExponent = 4 +network.unit.inspection.searchverify = true network.unit.remotecrawl.speed = 300 network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt diff --git a/defaults/yacy.network.intranet.unit b/defaults/yacy.network.intranet.unit index 4c122cb67..5ec5a50aa 100644 --- a/defaults/yacy.network.intranet.unit +++ b/defaults/yacy.network.intranet.unit @@ -14,6 +14,7 @@ network.unit.dht = false network.unit.dhtredundancy.junior = 1 network.unit.dhtredundancy.senior = 1 network.unit.dht.partitionExponent = 0 +network.unit.inspection.searchverify = false network.unit.remotecrawl.speed = 600 # each network may use different yacy distributions. diff --git a/defaults/yacy.network.webportal.unit b/defaults/yacy.network.webportal.unit index 681d1eacc..9762f3f26 100644 --- a/defaults/yacy.network.webportal.unit +++ b/defaults/yacy.network.webportal.unit @@ -11,6 +11,7 @@ network.unit.dht = false network.unit.dhtredundancy.junior = 1 network.unit.dhtredundancy.senior = 1 network.unit.dht.partitionExponent = 0 +network.unit.inspection.searchverify = false network.unit.remotecrawl.speed = 1 # each network may use different yacy distributions. diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 99c947dbc..a188f5c65 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -232,7 +232,8 @@ public final class search { false, indexSegment, rankingProfile, - header.get(RequestHeader.USER_AGENT, "") + header.get(RequestHeader.USER_AGENT, ""), + false ); yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); @@ -286,7 +287,8 @@ public final class search { false, sb.indexSegments.segment(Segments.Process.PUBLIC), rankingProfile, - header.get(RequestHeader.USER_AGENT, "") + header.get(RequestHeader.USER_AGENT, ""), + false ); yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); yacyChannel.channels(yacyChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), "")); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index e351bdf5d..61efec208 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -499,7 +499,8 @@ public class yacysearch { authenticated, indexSegment, ranking, - header.get(RequestHeader.USER_AGENT, "")); + header.get(RequestHeader.USER_AGENT, ""), + sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) && sb.peers.mySeed().getFlagAcceptRemoteIndex()); EventTracker.delete(EventTracker.EClass.SEARCH); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.Type.INITIALIZATION, "", 0, 0), false); diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index 7c0d5f1e4..66da4e4fd 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -103,6 +103,7 @@ public final class QueryParams { public long searchtime, urlretrievaltime, snippetcomputationtime; // time to perform the search, to get all the urls, and to compute the snippets public boolean specialRights; // is true if the user has a special authorization and my use more database-extensive options public final String userAgent; + public boolean filterfailurls; public QueryParams(final String queryString, final int itemsPerPage, @@ -154,6 +155,7 @@ public final class QueryParams { this.indexSegment = indexSegment; this.userAgent = userAgent; this.transmitcount = 0; + this.filterfailurls = false; } public QueryParams( @@ -175,7 +177,8 @@ public final class QueryParams { final boolean specialRights, final Segment indexSegment, final RankingProfile ranking, - final String userAgent) { + final String userAgent, + final boolean filterfailurls) { this.queryString = queryString; this.queryHashes = queryHashes; @@ -209,6 +212,7 @@ public final class QueryParams { this.indexSegment = indexSegment; this.userAgent = userAgent; this.transmitcount = 0; + this.filterfailurls = filterfailurls; } public Segment getSegment() { diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 567851024..1ead425b2 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -333,7 +333,7 @@ public class ResultFetcher { //System.out.println("page == null"); break; // no more available } - if (workTables.failURLsContains(page.hash())) continue; + if (query.filterfailurls && workTables.failURLsContains(page.hash())) continue; loops++; final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0 diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index f30b3cca6..19f2bacfd 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1656,14 +1656,16 @@ public final class Switchboard extends serverSwitch { this.clusterhashes = this.peers.clusterHashes(getConfig("cluster.peers.yacydomain", "")); // check if we are reachable and try to map port again if not (e.g. when router rebooted) - if(getConfigBool(SwitchboardConstants.UPNP_ENABLED, false) && sb.peers.mySeed().isJunior()) + if (getConfigBool(SwitchboardConstants.UPNP_ENABLED, false) && sb.peers.mySeed().isJunior()) UPnP.addPortMapping(); // after all clean up is done, check the resource usage observer.resourceObserverJob(); // cleanup cached search failures - this.tables.cleanFailURLS(this.getConfigLong("cleanup.failedSearchURLtimeout", -1)); + if (getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) && peers.mySeed().getFlagAcceptRemoteIndex()) { + this.tables.cleanFailURLS(this.getConfigLong("cleanup.failedSearchURLtimeout", -1)); + } return true; } catch (final InterruptedException e) { diff --git a/source/de/anomic/search/SwitchboardConstants.java b/source/de/anomic/search/SwitchboardConstants.java index c6bccf169..fe3c0fe65 100644 --- a/source/de/anomic/search/SwitchboardConstants.java +++ b/source/de/anomic/search/SwitchboardConstants.java @@ -389,6 +389,8 @@ public final class SwitchboardConstants { public static final String NETWORK_WHITELIST = "network.unit.access.whitelist"; public static final String NETWORK_BLACKLIST = "network.unit.access.blacklist"; + public static final String NETWORK_SEARCHVERIFY = "network.unit.inspection.searchverify"; + /** * appearance */