From f4a5c287fed21e0f439334dacb7e22172ea5f8a2 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 8 Sep 2007 11:50:19 +0000 Subject: [PATCH] re-implemented post-ranking of search results (should enhanced search result quality) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4080 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/yacy/search.java | 11 +- htroot/yacysearchitem.java | 18 +-- .../anomic/plasma/plasmaSearchContainer.java | 9 +- .../de/anomic/plasma/plasmaSearchEvent.java | 152 ++++++++++++++++-- .../de/anomic/plasma/plasmaSearchQuery.java | 2 +- .../plasma/plasmaSearchRankingProfile.java | 26 +-- source/de/anomic/yacy/yacyClient.java | 7 +- 8 files changed, 179 insertions(+), 48 deletions(-) diff --git a/build.properties b/build.properties index ebaa67364..1273505a7 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.543 +releaseVersion=0.544 releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseFileParentDir=yacy diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index e3688556d..d11ab08cc 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import de.anomic.http.httpHeader; @@ -226,12 +227,14 @@ public final class search { // prepare reference hints localProcess.startTimer(); - Object[] ws = theSearch.references(10); + Set ws = theSearch.references(10); StringBuffer refstr = new StringBuffer(); - for (int j = 0; j < ws.length; j++) - refstr.append(",").append((String) ws[j]); + Iterator j = ws.iterator(); + while (j.hasNext()) { + refstr.append(",").append((String) j.next()); + } prop.putASIS("references", (refstr.length() > 0) ? 
refstr.substring(1) : new String(refstr)); - localProcess.yield("reference collection", ws.length); + localProcess.yield("reference collection", ws.size()); } prop.putASIS("indexabstract", new String(indexabstract)); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 0c46d1747..e106c4513 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -29,6 +29,7 @@ import java.net.MalformedURLException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Iterator; +import java.util.Set; import java.util.TreeSet; import de.anomic.http.httpHeader; @@ -64,6 +65,7 @@ public class yacysearchitem { String eventID = post.get("eventID", ""); boolean bottomline = post.get("bottomline", "false").equals("true"); boolean authenticated = sb.adminAuthenticated(header) >= 2; + int item = post.getInt("item", -1); // find search event plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(eventID); @@ -74,19 +76,19 @@ public class yacysearchitem { prop.put("offset", theQuery.neededResults() - theQuery.displayResults() + 1); prop.put("global", theSearch.getGlobalCount()); prop.put("total", theSearch.getGlobalCount() + theSearch.getLocalCount()); - prop.put("items", theQuery.displayResults()); + prop.put("items", (item < 0) ? 
theQuery.neededResults() : item + 1); if (bottomline) { // attach the bottom line with search references (topwords) - final Object[] references = theSearch.references(20); - int hintcount = references.length; - if (hintcount > 0) { + final Set references = theSearch.references(20); + if (references.size() > 0) { prop.put("references", 1); // get the topwords final TreeSet topwords = new TreeSet(kelondroNaturalOrder.naturalOrder); String tmp = ""; - for (int i = 0; i < hintcount; i++) { - tmp = (String) references[i]; + Iterator i = references.iterator(); + while (i.hasNext()) { + tmp = (String) i.next(); if (tmp.matches("[a-z]+")) { topwords.add(tmp); } @@ -106,7 +108,7 @@ public class yacysearchitem { } String word; - hintcount = 0; + int hintcount = 0; final Iterator iter = topwords.iterator(); while (iter.hasNext()) { word = (String) iter.next(); @@ -134,8 +136,6 @@ public class yacysearchitem { prop.put("references", 0); // generate result object - int item = post.getInt("item", -1); - prop.put("items", (item < 0) ? 
theQuery.displayResults() : item + 1); plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item); if (result == null) { diff --git a/source/de/anomic/plasma/plasmaSearchContainer.java b/source/de/anomic/plasma/plasmaSearchContainer.java index 760aec759..de3f41232 100644 --- a/source/de/anomic/plasma/plasmaSearchContainer.java +++ b/source/de/anomic/plasma/plasmaSearchContainer.java @@ -145,10 +145,15 @@ public class plasmaSearchContainer { return this.globalcount; } - public Object[] getReferences(int count) { + public Set getReferences(int count) { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls - return ref.getScores(count, false, 2, Integer.MAX_VALUE); + Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE); + TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER); + for (int i = 0; i < refs.length; i++) { + s.add((String) refs[i]); + } + return s; } public void addReferences(String[] words) { diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 9ad29992f..3e05eaa49 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -51,7 +51,7 @@ import de.anomic.yacy.yacyURL; public final class plasmaSearchEvent { - public static int workerThreadCount = 5; + public static int workerThreadCount = 10; public static String lastEventID = ""; private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes @@ -71,6 +71,7 @@ public final class plasmaSearchEvent { private int localcount; private resultWorker[] workerThreads; private ArrayList resultList; // list of this.Entry objects + //private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not 
be changed again private HashMap failedURLs; // a mapping from a urlhash to a fail reason string TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets @@ -97,6 +98,7 @@ public final class plasmaSearchEvent { this.localcount = 0; this.workerThreads = null; this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets + //this.resultListLock = 0; // no locked elements until now this.failedURLs = new HashMap(); // a map of urls to reason strings where a worker thread tried to work on, but failed. // snippets do not need to match with the complete query hashes, @@ -120,7 +122,7 @@ public final class plasmaSearchEvent { // the result of the fetch is then in the rcGlobal process.startTimer(); serverLog.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); - primarySearchThreads = yacySearch.primaryRemoteSearches( + this.primarySearchThreads = yacySearch.primaryRemoteSearches( plasmaSearchQuery.hashSet2hashString(query.queryHashes), plasmaSearchQuery.hashSet2hashString(query.excludeHashes), "", @@ -136,7 +138,7 @@ public final class plasmaSearchEvent { ranking, query.constraint, (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ? 
null : preselectedPeerHashes); - process.yield("remote search thread start", primarySearchThreads.length); + process.yield("remote search thread start", this.primarySearchThreads.length); // meanwhile do a local search Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null); @@ -400,6 +402,22 @@ return false; } + private boolean anyRemoteSearchAlive() { + // check primary search threads + if ((this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0)) { + for (int i = 0; i < this.primarySearchThreads.length; i++) { + if ((this.primarySearchThreads[i] != null) && (this.primarySearchThreads[i].isAlive())) return true; + } + } + // maybe a secondary search thread is alive, check this + if ((this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0)) { + for (int i = 0; i < this.secondarySearchThreads.length; i++) { + if ((this.secondarySearchThreads[i] != null) && (this.secondarySearchThreads[i].isAlive())) return true; + } + } + return false; + } + public plasmaSearchQuery getQuery() { return query; } @@ -454,7 +472,7 @@ // if worker threads had been alive, but did not succeed, start them again to fetch missing links if ((query.onlineSnippetFetch) && (!event.anyWorkerAlive()) && - (event.resultList.size() < query.neededResults()) && + (event.resultList.size() < query.neededResults() + 10) && ((event.getLocalCount() + event.getGlobalCount()) > event.resultList.size())) { // set new timeout event.eventTime = System.currentTimeMillis(); @@ -493,10 +511,14 @@ public void run() { // sleep first to give remote loading threads a chance to fetch entries - try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {} + if (anyRemoteSearchAlive()) try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {} // start fetching urls and snippets - while ((resultList.size() < 
query.neededResults() + query.displayResults()) && (System.currentTimeMillis() < this.timeout)) { + while (true) { + + if (resultList.size() > query.neededResults() + query.displayResults()) break; // computed enough + + if (System.currentTimeMillis() > this.timeout) break; // time is over // try secondary search prepareSecondarySearch(); // will be executed only once @@ -505,9 +527,14 @@ this.entry = null; entry = nextOrder(); if (entry == null) { - // wait and try again - try {Thread.sleep(100);} catch (InterruptedException e) {} - continue; + if (anyRemoteSearchAlive()) { + // wait and try again + try {Thread.sleep(100);} catch (InterruptedException e) {} + continue; + } else { + // we will not see that there come more results in + break; + } } indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry); @@ -531,7 +558,7 @@ System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url()); } - System.out.println("DEBUG: resultWorker thread " + id + " terminated"); + serverLog.logInfo("SEARCH", "resultWorker thread " + id + " terminated"); } private indexRWIEntry nextOrder() { @@ -574,29 +601,106 @@ serverLog.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason); } - public ResultEntry oneResult(int item) { // first sleep a while to give accumulation threads a chance to work long sleeptime = this.eventTime + (this.query.maximumTime / this.query.displayResults() * ((item % this.query.displayResults()) + 1)) - System.currentTimeMillis(); - if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) && - (anyWorkerAlive()) && - (sleeptime > 0)) try {Thread.sleep(sleeptime);} catch (InterruptedException e) {} + if ((anyWorkerAlive()) && (sleeptime > 0)) { + try {Thread.sleep(sleeptime);} catch (InterruptedException e) {} + } - // then sleep until a result is available + // if there are less than 
10 more results available, sleep some extra time to get a chance that the "common sense" ranking algorithm can work + if ((this.resultList.size() <= item + 10) && (anyWorkerAlive())) { + try {Thread.sleep(300);} catch (InterruptedException e) {} + } + // then sleep until any result is available (that should not happen) while ((this.resultList.size() <= item) && (anyWorkerAlive())) { try {Thread.sleep(100);} catch (InterruptedException e) {} } // finally, if there is something, return the result synchronized (this.resultList) { + // check if we have enough entries if (this.resultList.size() <= item) return null; - // todo: fetch best result (switch) from item position to end of resultList + // fetch the best entry from the resultList, not the entry from item position + // whenever a specific entry was switched in its position and was returned here + // a moving pointer is set to assign that item position as not changeable + int bestpick = postRankingFavourite(item); + if (bestpick != item) { + // switch the elements + ResultEntry buf = (ResultEntry) this.resultList.get(bestpick); + serverLog.logInfo("SEARCH_POSTRANKING", "prefering [" + bestpick + "] " + buf.urlstring() + " over [" + item + "] " + ((ResultEntry) this.resultList.get(item)).urlstring()); + this.resultList.set(bestpick, (ResultEntry) this.resultList.get(item)); + this.resultList.set(item, buf); + } + //this.resultListLock = item; // lock the element; be prepared to return it return (ResultEntry) this.resultList.get(item); } } + private int postRankingFavourite(int item) { + // do a post-ranking on resultList, which should be locked upon time of this call + long rank, bestrank = 0; + int bestitem = item; + ResultEntry entry; + for (int i = item; i < this.resultList.size(); i++) { + entry = (ResultEntry) this.resultList.get(i); + rank = this.ranking.postRanking(this.query, this.references(10), entry, item); + if (rank > bestrank) { + bestrank = rank; + bestitem = i; + } + } + return bestitem; + } + + /* 
+ public void removeRedundant() { + // remove all urls from the pageAcc structure that occur double by specific redundancy rules + // a link is redundant, if a sub-path of the url is cited before. redundant urls are removed + // we find redundant urls by iteration over all elements in pageAcc + Iterator i = pageAcc.entrySet().iterator(); + HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation + Map.Entry entry; + + // first scan all entries and find all urls that are referenced + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + paths.put(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true), entry.getKey()); + //if (path != null) path = shortenPath(path); + //if (path != null) paths.put(path, entry.getKey()); + } + + // now scan the pageAcc again and remove all redundant urls + i = pageAcc.entrySet().iterator(); + String shorten; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + shorten = shortenPath(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true)); + // scan all subpaths of the url + while (shorten != null) { + if (pageAcc.size() <= query.wantedResults) break; + if (paths.containsKey(shorten)) { + //System.out.println("deleting path from search result: " + path + " is redundant to " + shorten); + try { + i.remove(); + } catch (IllegalStateException e) { + + } + } + shorten = shortenPath(shorten); + } + } + } + + private static String shortenPath(String path) { + int pos = path.lastIndexOf('/'); + if (pos < 0) return null; + return path.substring(0, pos); + } + */ + public ArrayList completeResults(long waitingtime) { long timeout = System.currentTimeMillis() + waitingtime; while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) { @@ -743,7 +847,8 @@ public final class plasmaSearchEvent { //assert e != null; } - public Object[] references(int count) { + public Set references(int count) { + // returns a set of words that are 
computed as toplist return this.rankedCache.getReferences(count); } @@ -791,6 +896,7 @@ public final class plasmaSearchEvent { if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p); } } + public String hash() { return urlentry.hash(); } @@ -821,6 +927,18 @@ public final class plasmaSearchEvent { public int filesize() { return urlentry.size(); } + public int limage() { + return urlentry.limage(); + } + public int laudio() { + return urlentry.laudio(); + } + public int lvideo() { + return urlentry.lvideo(); + } + public int lapp() { + return urlentry.lapp(); + } public indexRWIEntry word() { return urlentry.word(); } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 3789e12b3..0c9fd9c20 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -115,7 +115,7 @@ public final class plasmaSearchQuery { } public int displayResults() { - // the number if result lines that are displayed at once (size of result page) + // the number of result lines that are displayed at once (size of result page) return this.linesPerPage; } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 98fda8f1f..89ef1abdd 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -47,9 +47,9 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.index.indexRWIEntry; import de.anomic.yacy.yacyURL; -import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; public class plasmaSearchRankingProfile { @@ -290,30 +290,30 @@ public class plasmaSearchRankingProfile { public long postRanking( plasmaSearchQuery query, Set topwords, - String[] urlcomps, - String[] descrcomps, 
- indexURLEntry page, + plasmaSearchEvent.ResultEntry rentry, int position) { long ranking = (255 - position) << 8; // for media search: prefer pages with many links - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ranking += page.limage() << coeff_cathasimage; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ranking += page.limage() << coeff_cathasaudio; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ranking += page.limage() << coeff_cathasvideo; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) ranking += page.limage() << coeff_cathasapp; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ranking += rentry.limage() << coeff_cathasimage; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ranking += rentry.laudio() << coeff_cathasaudio; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ranking += rentry.lvideo() << coeff_cathasvideo; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) ranking += rentry.lapp() << coeff_cathasapp; // prefer hit with 'prefer' pattern - indexURLEntry.Components comp = page.comp(); - if (comp.url().toNormalform(true, true).matches(query.prefer)) ranking += 256 << coeff_prefer; - if (comp.title().matches(query.prefer)) ranking += 256 << coeff_prefer; + if (rentry.url().toNormalform(true, true).matches(query.prefer)) ranking += 256 << coeff_prefer; + if (rentry.title().matches(query.prefer)) ranking += 256 << coeff_prefer; // apply 'common-sense' heuristic using references + String urlstring = rentry.url().toNormalform(true, true); + String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring); + String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); for (int j = 0; j < urlcomps.length; j++) { - if (topwords.contains(urlcomps[j])) ranking += 256 << coeff_urlcompintoplist; + if (topwords.contains(urlcomps[j])) ranking += Math.max(1, 256 - urlstring.length()) << coeff_urlcompintoplist; } for (int j = 0; j 
< descrcomps.length; j++) { - if (topwords.contains(descrcomps[j])) ranking += 256 << coeff_descrcompintoplist; + if (topwords.contains(descrcomps[j])) ranking += Math.max(1, 256 - rentry.title().length()) << coeff_descrcompintoplist; } // apply query-in-result matching diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index dd1c3abee..825128746 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -513,7 +513,12 @@ public final class yacyClient { // integrate remote topwords String references = (String) result.get("references"); - if (references != null) containerCache.addReferences(references.split(",")); + yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references); + if (references != null) { + // add references twice, so they can be countet (must have at least 2 entries) + containerCache.addReferences(references.split(",")); + containerCache.addReferences(references.split(",")); + } } // insert the containers to the index