From 84f82541e8cb7854c239eed5fe1ea288c0e677f9 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 19 Dec 2012 10:41:22 +0100 Subject: [PATCH] search process enhancements --- htroot/yacysearchtrailer.java | 2 +- .../net/yacy/cora/protocol/ftp/FTPClient.java | 1 + .../net/yacy/repository/LoaderDispatcher.java | 2 +- source/net/yacy/search/query/QueryParams.java | 2 +- .../net/yacy/search/query/RankingProcess.java | 13 +----- source/net/yacy/search/query/SearchEvent.java | 44 ++++--------------- 6 files changed, 15 insertions(+), 49 deletions(-) diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index 70f79c8e9..e77aede54 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -107,7 +107,7 @@ public class yacysearchtrailer { } // host navigators - final ScoreMap hostNavigator = theSearch.rankingProcess.getHostNavigator(); + final ScoreMap hostNavigator = theSearch.hostNavigator; if (hostNavigator == null || hostNavigator.isEmpty()) { prop.put("nav-domains", 0); } else { diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index 4a4491275..534c63480 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -1970,6 +1970,7 @@ public class FTPClient { // protocoll socket commands private void send(final String buf) throws IOException { + if (this.clientOutput == null) return; byte[] b = buf.getBytes("UTF-8"); this.clientOutput.write(b, 0, b.length); this.clientOutput.write('\r'); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 83c860682..4f8102f23 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -230,7 +230,7 @@ public final class LoaderDispatcher { } // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test - assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy; + //assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy; if (response.isFreshForProxy()) { final byte[] content = Cache.getContent(url.hash()); if (content != null) { diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 83c67a6ef..a1c3c4470 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -463,7 +463,7 @@ public final class QueryParams { while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1); while ((p = urlMaskPattern.indexOf('/')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1); while ((p = urlMaskPattern.indexOf('\\')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 2); - //fq.append(" AND ").append(YaCySchema.sku.getSolrFieldName() + ":/" + urlMaskPattern + "/"); + fq.append(" AND ").append(YaCySchema.sku.getSolrFieldName() + ":/" + urlMaskPattern + "/"); } if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) { diff --git a/source/net/yacy/search/query/RankingProcess.java b/source/net/yacy/search/query/RankingProcess.java index 4658016d4..5d39802d1 100644 --- a/source/net/yacy/search/query/RankingProcess.java +++ b/source/net/yacy/search/query/RankingProcess.java @@ -82,7 +82,6 @@ public final class RankingProcess extends Thread { protected final AtomicInteger receivedRemoteReferences; protected final ReferenceOrder order; protected final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) - protected final ScoreMap hostNavigator = new ConcurrentScoreMap(); // a counter for the appearance of host names private final Map taggingPredicates; // a map from tagging vocabulary names to tagging predicate uris private boolean remote; @@ -361,11 +360,7 @@ public final class RankingProcess extends Thread { // this is only available if execQuery() was called before return this.localSearchInclusion; } - - public ScoreMap getHostNavigator() { - return this.hostNavigator; - } - + public ScoreMap getTopicNavigator(final int count) { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls @@ -420,14 +415,10 @@ public final class RankingProcess extends Thread { protected void addTopics(final ResultEntry resultEntry) { // take out relevant information for reference computation - if ( (resultEntry.url() == null) || (resultEntry.title() == null) ) { - return; - } - //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url + if ((resultEntry.url() == null) || (resultEntry.title() == null)) return; final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description // add references - //addTopic(urlcomps); addTopic(descrcomps); } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index d88321c2b..1334db377 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -62,7 +62,6 @@ import net.yacy.document.LargeNumberCache; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -113,6 +112,7 @@ public final class SearchEvent { private byte[] IAmaxcounthash, IAneardhthash; private final Thread localsearch; private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counter for referenced that had been sorted out for other reasons + public final ScoreMap hostNavigator; // a counter for the appearance of host names public final ScoreMap authorNavigator; // a counter for the appearances of authors public final ScoreMap namespaceNavigator; // a counter for name spaces public final ScoreMap protocolNavigator; // a counter for protocol types @@ -164,6 +164,7 @@ public final class SearchEvent { } else { this.namespaceNavigator = null; } + this.hostNavigator = new ConcurrentScoreMap(); this.protocolNavigator = new ConcurrentScoreMap(); this.filetypeNavigator = new ConcurrentScoreMap(); this.vocabularyNavigator = new ConcurrentHashMap>(); @@ -478,7 +479,7 @@ public final class SearchEvent { // collect navigation information ReversibleScoreMap fcts = facets.get(YaCySchema.host_s.getSolrFieldName()); - if (fcts != null) this.rankingProcess.hostNavigator.inc(fcts); + if (fcts != null) this.hostNavigator.inc(fcts); if (this.filetypeNavigator != null) { fcts = facets.get(YaCySchema.url_file_ext_s.getSolrFieldName()); @@ -564,38 +565,6 @@ public final class SearchEvent { if (!hosthash.equals(this.query.nav_sitehash)) continue pollloop; } - // check vocabulary constraint - /* - String subject = YaCyMetadata.hashURI(iEntry.hash()); - Resource resource = JenaTripleStore.getResource(subject); - if (this.query.metatags != null && !this.query.metatags.isEmpty()) { - // all metatags must appear in the tags list - for (Tagging.Metatag metatag: this.query.metatags) { - Iterator ni = JenaTripleStore.getObjects(resource, metatag.getPredicate()); - if (!ni.hasNext()) continue pollloop; - String tags = ni.next().toString(); - if (tags.indexOf(metatag.getObject()) < 0) continue pollloop; - } - } - */ - // add navigators using the triplestore - /* - for (Map.Entry v: this.rankingProcess.taggingPredicates.entrySet()) { - Iterator ni = JenaTripleStore.getObjects(resource, v.getValue()); - while (ni.hasNext()) { - String[] tags = CommonPattern.COMMA.split(ni.next().toString()); - for (String tag: tags) { - ScoreMap voc = this.rankingProcess.vocabularyNavigator.get(v.getKey()); - if (voc == null) { - voc = new ConcurrentScoreMap(); - this.rankingProcess.vocabularyNavigator.put(v.getKey(), voc); - } - voc.inc(tag); - } - } - } - */ - // finally extend the double-check and insert result to stack this.rankingProcess.urlhashes.putUnique(iEntry.hash()); rankingtryloop: while (true) { @@ -894,7 +863,12 @@ public final class SearchEvent { deployWorker(Math.min(SNIPPET_WORKER_THREADS, this.query.itemsPerPage), this.query.neededResults()); } // wait until local data is there - while (this.localsearch != null && this.localsearch.isAlive() && this.result.sizeAvailable() < item) try {this.localsearch.join(10);} catch (InterruptedException e) {} + while (this.localsearch != null && this.localsearch.isAlive() && this.result.sizeAvailable() < item) try { + if (!anyWorkerAlive()) { + deployWorker(Math.min(SNIPPET_WORKER_THREADS, this.query.itemsPerPage), this.query.neededResults()); + } + this.localsearch.join(10); + } catch (InterruptedException e) {} // check if we already retrieved this item // (happens if a search pages is accessed a second time) final long finishTime = System.currentTimeMillis() + timeout;