re-implemented post-ranking of search results

(should enhance search result quality)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4080 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 18 years ago
parent d962200d11
commit f4a5c287fe
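
The heart of this commit is the new select-and-swap post-ranking in plasmaSearchEvent (see the oneResult()/postRankingFavourite() hunks below): when the result at position `item` is requested, the tail of the result list is re-scored and the best-scoring entry is swapped into that slot. A minimal standalone sketch of the idea, with illustrative names that are not the actual YaCy API:

```java
import java.util.List;

// Illustrative sketch only; the real implementation is in the
// plasmaSearchEvent.oneResult()/postRankingFavourite() hunks below.
class PostRankingSketch {

    // stand-in for plasmaSearchRankingProfile.postRanking(...)
    interface Scorer {
        long score(String entry, int position);
    }

    // return the entry for 'item', after swapping the best-scoring
    // candidate from item..end of the (still growing) result list
    // into the requested slot
    static String oneResult(List<String> results, int item, Scorer scorer) {
        synchronized (results) {
            if (results.size() <= item) return null; // not enough results yet
            int best = item;
            long bestRank = 0;
            for (int i = item; i < results.size(); i++) {
                long rank = scorer.score(results.get(i), item);
                if (rank > bestRank) {
                    bestRank = rank;
                    best = i;
                }
            }
            if (best != item) { // swap the favourite into place
                String tmp = results.get(best);
                results.set(best, results.get(item));
                results.set(item, tmp);
            }
            return results.get(item);
        }
    }
}
```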

@@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.543
releaseVersion=0.544
releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy

@@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.http.httpHeader;
@@ -226,12 +227,14 @@ public final class search {
// prepare reference hints
localProcess.startTimer();
Object[] ws = theSearch.references(10);
Set ws = theSearch.references(10);
StringBuffer refstr = new StringBuffer();
for (int j = 0; j < ws.length; j++)
refstr.append(",").append((String) ws[j]);
Iterator j = ws.iterator();
while (j.hasNext()) {
refstr.append(",").append((String) j.next());
}
prop.putASIS("references", (refstr.length() > 0) ? refstr.substring(1) : new String(refstr));
localProcess.yield("reference collection", ws.length);
localProcess.yield("reference collection", ws.size());
}
prop.putASIS("indexabstract", new String(indexabstract));

@@ -29,6 +29,7 @@ import java.net.MalformedURLException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.http.httpHeader;
@@ -64,6 +65,7 @@ public class yacysearchitem {
String eventID = post.get("eventID", "");
boolean bottomline = post.get("bottomline", "false").equals("true");
boolean authenticated = sb.adminAuthenticated(header) >= 2;
int item = post.getInt("item", -1);
// find search event
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(eventID);
@@ -74,19 +76,19 @@ public class yacysearchitem {
prop.put("offset", theQuery.neededResults() - theQuery.displayResults() + 1);
prop.put("global", theSearch.getGlobalCount());
prop.put("total", theSearch.getGlobalCount() + theSearch.getLocalCount());
prop.put("items", theQuery.displayResults());
prop.put("items", (item < 0) ? theQuery.neededResults() : item + 1);
if (bottomline) {
// attach the bottom line with search references (topwords)
final Object[] references = theSearch.references(20);
int hintcount = references.length;
if (hintcount > 0) {
final Set references = theSearch.references(20);
if (references.size() > 0) {
prop.put("references", 1);
// get the topwords
final TreeSet topwords = new TreeSet(kelondroNaturalOrder.naturalOrder);
String tmp = "";
for (int i = 0; i < hintcount; i++) {
tmp = (String) references[i];
Iterator i = references.iterator();
while (i.hasNext()) {
tmp = (String) i.next();
if (tmp.matches("[a-z]+")) {
topwords.add(tmp);
}
@@ -106,7 +108,7 @@ public class yacysearchitem {
}
String word;
hintcount = 0;
int hintcount = 0;
final Iterator iter = topwords.iterator();
while (iter.hasNext()) {
word = (String) iter.next();
@@ -134,8 +136,6 @@ public class yacysearchitem {
prop.put("references", 0);
// generate result object
int item = post.getInt("item", -1);
prop.put("items", (item < 0) ? theQuery.displayResults() : item + 1);
plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
if (result == null) {

@@ -145,10 +145,15 @@ public class plasmaSearchContainer {
return this.globalcount;
}
public Object[] getReferences(int count) {
public Set getReferences(int count) {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
return ref.getScores(count, false, 2, Integer.MAX_VALUE);
Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);
TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER);
for (int i = 0; i < refs.length; i++) {
s.add((String) refs[i]);
}
return s;
}
public void addReferences(String[] words) {
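
Returning a TreeSet built with String.CASE_INSENSITIVE_ORDER (rather than a raw Object[]) makes membership tests on the top words case-insensitive, which the topword matching in plasmaSearchRankingProfile.postRanking() further down takes advantage of. A small illustration:

```java
import java.util.TreeSet;

public class CaseInsensitiveRefs {
    public static void main(String[] args) {
        TreeSet<String> refs = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
        refs.add("Linux");
        // membership tests now ignore case, which the topword matching
        // in plasmaSearchRankingProfile.postRanking() relies on
        System.out.println(refs.contains("linux")); // prints: true
    }
}
```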

@@ -51,7 +51,7 @@ import de.anomic.yacy.yacyURL;
public final class plasmaSearchEvent {
public static int workerThreadCount = 5;
public static int workerThreadCount = 10;
public static String lastEventID = "";
private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests
public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes
@@ -71,6 +71,7 @@ public final class plasmaSearchEvent {
private int localcount;
private resultWorker[] workerThreads;
private ArrayList resultList; // list of this.Entry objects
//private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again
private HashMap failedURLs; // a mapping from a urlhash to a fail reason string
TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
@@ -97,6 +98,7 @@ public final class plasmaSearchEvent {
this.localcount = 0;
this.workerThreads = null;
this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets
//this.resultListLock = 0; // no locked elements until now
this.failedURLs = new HashMap(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
// snippets do not need to match with the complete query hashes,
@@ -120,7 +122,7 @@ public final class plasmaSearchEvent {
// the result of the fetch is then in the rcGlobal
process.startTimer();
serverLog.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs");
primarySearchThreads = yacySearch.primaryRemoteSearches(
this.primarySearchThreads = yacySearch.primaryRemoteSearches(
plasmaSearchQuery.hashSet2hashString(query.queryHashes),
plasmaSearchQuery.hashSet2hashString(query.excludeHashes),
"",
@@ -136,7 +138,7 @@ public final class plasmaSearchEvent {
ranking,
query.constraint,
(query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes);
process.yield("remote search thread start", primarySearchThreads.length);
process.yield("remote search thread start", this.primarySearchThreads.length);
// meanwhile do a local search
Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null);
@@ -400,6 +402,22 @@ public final class plasmaSearchEvent {
return false;
}
private boolean anyRemoteSearchAlive() {
// check primary search threads
if ((this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0)) {
for (int i = 0; i < this.primarySearchThreads.length; i++) {
if ((this.primarySearchThreads[i] != null) && (this.primarySearchThreads[i].isAlive())) return true;
}
}
// maybe a secondary search thread is alive, check this
if ((this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0)) {
for (int i = 0; i < this.secondarySearchThreads.length; i++) {
if ((this.secondarySearchThreads[i] != null) && (this.secondarySearchThreads[i].isAlive())) return true;
}
}
return false;
}
public plasmaSearchQuery getQuery() {
return query;
}
@@ -454,7 +472,7 @@ public final class plasmaSearchEvent {
// if worker threads had been alive, but did not succeed, start them again to fetch missing links
if ((query.onlineSnippetFetch) &&
(!event.anyWorkerAlive()) &&
(event.resultList.size() < query.neededResults()) &&
(event.resultList.size() < query.neededResults() + 10) &&
((event.getLocalCount() + event.getGlobalCount()) > event.resultList.size())) {
// set new timeout
event.eventTime = System.currentTimeMillis();
@@ -493,10 +511,14 @@ public final class plasmaSearchEvent {
public void run() {
// sleep first to give remote loading threads a chance to fetch entries
try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {}
if (anyRemoteSearchAlive()) try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {}
// start fetching urls and snippets
while ((resultList.size() < query.neededResults() + query.displayResults()) && (System.currentTimeMillis() < this.timeout)) {
while (true) {
if (resultList.size() > query.neededResults() + query.displayResults()) break; // computed enough
if (System.currentTimeMillis() > this.timeout) break; // time is over
// try secondary search
prepareSecondarySearch(); // will be executed only once
@@ -505,9 +527,14 @@ public final class plasmaSearchEvent {
this.entry = null;
entry = nextOrder();
if (entry == null) {
// wait and try again
try {Thread.sleep(100);} catch (InterruptedException e) {}
continue;
if (anyRemoteSearchAlive()) {
// wait and try again
try {Thread.sleep(100);} catch (InterruptedException e) {}
continue;
} else {
// we will not see any more results coming in
break;
}
}
indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry);
@@ -531,7 +558,7 @@ public final class plasmaSearchEvent {
System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
}
System.out.println("DEBUG: resultWorker thread " + id + " terminated");
serverLog.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
}
private indexRWIEntry nextOrder() {
@@ -574,29 +601,106 @@ public final class plasmaSearchEvent {
serverLog.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
}
public ResultEntry oneResult(int item) {
// first sleep a while to give accumulation threads a chance to work
long sleeptime = this.eventTime + (this.query.maximumTime / this.query.displayResults() * ((item % this.query.displayResults()) + 1)) - System.currentTimeMillis();
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) &&
(anyWorkerAlive()) &&
(sleeptime > 0)) try {Thread.sleep(sleeptime);} catch (InterruptedException e) {}
if ((anyWorkerAlive()) && (sleeptime > 0)) {
try {Thread.sleep(sleeptime);} catch (InterruptedException e) {}
}
// then sleep until a result is available
// if fewer than 10 more results are available, sleep some extra time to give the "common sense" ranking algorithm a chance to work
if ((this.resultList.size() <= item + 10) && (anyWorkerAlive())) {
try {Thread.sleep(300);} catch (InterruptedException e) {}
}
// then sleep until any result is available (that should not happen)
while ((this.resultList.size() <= item) && (anyWorkerAlive())) {
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
// finally, if there is something, return the result
synchronized (this.resultList) {
// check if we have enough entries
if (this.resultList.size() <= item) return null;
// todo: fetch best result (switch) from item position to end of resultList
// fetch the best entry from the resultList, not the entry at the item position;
// whenever a specific entry is switched into position and returned here,
// a moving pointer marks that item position as no longer changeable
int bestpick = postRankingFavourite(item);
if (bestpick != item) {
// switch the elements
ResultEntry buf = (ResultEntry) this.resultList.get(bestpick);
serverLog.logInfo("SEARCH_POSTRANKING", "prefering [" + bestpick + "] " + buf.urlstring() + " over [" + item + "] " + ((ResultEntry) this.resultList.get(item)).urlstring());
this.resultList.set(bestpick, (ResultEntry) this.resultList.get(item));
this.resultList.set(item, buf);
}
//this.resultListLock = item; // lock the element; be prepared to return it
return (ResultEntry) this.resultList.get(item);
}
}
private int postRankingFavourite(int item) {
// do a post-ranking on resultList, which should be locked at the time of this call
long rank, bestrank = 0;
int bestitem = item;
ResultEntry entry;
for (int i = item; i < this.resultList.size(); i++) {
entry = (ResultEntry) this.resultList.get(i);
rank = this.ranking.postRanking(this.query, this.references(10), entry, item);
if (rank > bestrank) {
bestrank = rank;
bestitem = i;
}
}
return bestitem;
}
/*
public void removeRedundant() {
// remove all urls from the pageAcc structure that occur double by specific redundancy rules
// a link is redundant, if a sub-path of the url is cited before. redundant urls are removed
// we find redundant urls by iteration over all elements in pageAcc
Iterator i = pageAcc.entrySet().iterator();
HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
Map.Entry entry;
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
paths.put(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true), entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
}
// now scan the pageAcc again and remove all redundant urls
i = pageAcc.entrySet().iterator();
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
shorten = shortenPath(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true));
// scan all subpaths of the url
while (shorten != null) {
if (pageAcc.size() <= query.wantedResults) break;
if (paths.containsKey(shorten)) {
//System.out.println("deleting path from search result: " + path + " is redundant to " + shorten);
try {
i.remove();
} catch (IllegalStateException e) {
}
}
shorten = shortenPath(shorten);
}
}
}
private static String shortenPath(String path) {
int pos = path.lastIndexOf('/');
if (pos < 0) return null;
return path.substring(0, pos);
}
*/
public ArrayList completeResults(long waitingtime) {
long timeout = System.currentTimeMillis() + waitingtime;
while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
@@ -743,7 +847,8 @@ public final class plasmaSearchEvent {
//assert e != null;
}
public Object[] references(int count) {
public Set references(int count) {
// returns a set of words that are computed as toplist
return this.rankedCache.getReferences(count);
}
@@ -791,6 +896,7 @@ public final class plasmaSearchEvent {
if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p);
}
}
public String hash() {
return urlentry.hash();
}
@@ -821,6 +927,18 @@ public final class plasmaSearchEvent {
public int filesize() {
return urlentry.size();
}
public int limage() {
return urlentry.limage();
}
public int laudio() {
return urlentry.laudio();
}
public int lvideo() {
return urlentry.lvideo();
}
public int lapp() {
return urlentry.lapp();
}
public indexRWIEntry word() {
return urlentry.word();
}
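
To see how the adaptive sleep at the top of oneResult() spreads the time budget across a result page, here is a worked example with hypothetical numbers (maximumTime and displayResults come from plasmaSearchQuery; the values below are made up):

```java
public class SleepBudgetExample {
    public static void main(String[] args) {
        // hypothetical numbers, only to illustrate the formula in oneResult()
        long eventTime = 0;      // search start time (illustrative)
        long maximumTime = 3000; // total time budget in ms (assumed value)
        int displayResults = 10; // results per page (assumed value)
        int item = 4;            // the fifth result on the page
        long now = 800;          // "current" time in ms

        long sleeptime = eventTime
                + (maximumTime / displayResults * ((item % displayResults) + 1))
                - now;
        // = (3000 / 10) * 5 - 800 = 700 ms: later items on a page get
        // proportionally later deadlines, so the workers have time to
        // fill and post-rank the result list first
        System.out.println("sleeptime = " + sleeptime + " ms");
    }
}
```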

@@ -115,7 +115,7 @@ public final class plasmaSearchQuery {
}
public int displayResults() {
// the number if result lines that are displayed at once (size of result page)
// the number of result lines that are displayed at once (size of result page)
return this.linesPerPage;
}

@@ -47,9 +47,9 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexRWIEntry;
import de.anomic.yacy.yacyURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
public class plasmaSearchRankingProfile {
@@ -290,30 +290,30 @@ public class plasmaSearchRankingProfile {
public long postRanking(
plasmaSearchQuery query,
Set topwords,
String[] urlcomps,
String[] descrcomps,
indexURLEntry page,
plasmaSearchEvent.ResultEntry rentry,
int position) {
long ranking = (255 - position) << 8;
// for media search: prefer pages with many links
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ranking += page.limage() << coeff_cathasimage;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ranking += page.limage() << coeff_cathasaudio;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ranking += page.limage() << coeff_cathasvideo;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) ranking += page.limage() << coeff_cathasapp;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ranking += rentry.limage() << coeff_cathasimage;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ranking += rentry.laudio() << coeff_cathasaudio;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ranking += rentry.lvideo() << coeff_cathasvideo;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) ranking += rentry.lapp() << coeff_cathasapp;
// prefer hit with 'prefer' pattern
indexURLEntry.Components comp = page.comp();
if (comp.url().toNormalform(true, true).matches(query.prefer)) ranking += 256 << coeff_prefer;
if (comp.title().matches(query.prefer)) ranking += 256 << coeff_prefer;
if (rentry.url().toNormalform(true, true).matches(query.prefer)) ranking += 256 << coeff_prefer;
if (rentry.title().matches(query.prefer)) ranking += 256 << coeff_prefer;
// apply 'common-sense' heuristic using references
String urlstring = rentry.url().toNormalform(true, true);
String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) ranking += 256 << coeff_urlcompintoplist;
if (topwords.contains(urlcomps[j])) ranking += Math.max(1, 256 - urlstring.length()) << coeff_urlcompintoplist;
}
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j])) ranking += 256 << coeff_descrcompintoplist;
if (topwords.contains(descrcomps[j])) ranking += Math.max(1, 256 - rentry.title().length()) << coeff_descrcompintoplist;
}
// apply query-in-result matching
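
The rewritten bonus terms replace the flat 256 << coeff reward with one scaled by URL and title length, so that among results sharing a top word, shorter (usually more canonical) URLs rank higher. A worked comparison with a hypothetical coefficient value:

```java
public class TopwordBonusExample {
    public static void main(String[] args) {
        int coeff_urlcompintoplist = 4; // hypothetical coefficient value
        String shortUrl = "http://yacy.net/";                                      // 16 chars
        String longUrl = "http://example.org/a/very/deep/path/to/some/page.html"; // 53 chars

        // old bonus: a flat 256 << coeff for every topword hit
        long oldBonus = 256L << coeff_urlcompintoplist;

        // new bonus: Math.max(1, 256 - urlstring.length()) << coeff,
        // so a shorter URL earns a larger reward per matching topword
        long newShort = (long) Math.max(1, 256 - shortUrl.length()) << coeff_urlcompintoplist;
        long newLong = (long) Math.max(1, 256 - longUrl.length()) << coeff_urlcompintoplist;

        System.out.println(oldBonus + " vs " + newShort + " / " + newLong);
        // prints: 4096 vs 3840 / 3248, so the short URL now wins
    }
}
```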

@@ -513,7 +513,12 @@ public final class yacyClient {
// integrate remote topwords
String references = (String) result.get("references");
if (references != null) containerCache.addReferences(references.split(","));
yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references);
if (references != null) {
// add references twice, so they can be counted (must occur at least twice)
containerCache.addReferences(references.split(","));
containerCache.addReferences(references.split(","));
}
}
// insert the containers to the index
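
The duplicated addReferences() call is a workaround for the minimum-score filter seen earlier in this diff (getScores(count, false, 2, Integer.MAX_VALUE) in plasmaSearchContainer): a word must be counted at least twice to survive. A hypothetical counter shows the effect:

```java
import java.util.Map;
import java.util.TreeMap;

public class MinCountExample {
    public static void main(String[] args) {
        // hypothetical stand-in for the reference score cluster
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        String[] refs = "linux,software,download".split(",");

        // adding the words once would leave every count at 1, below the
        // minimum score of 2 used by getScores(); adding the same array
        // twice, as yacyClient now does, pushes each count to the threshold
        for (int pass = 0; pass < 2; pass++) {
            for (String w : refs) {
                Integer c = counts.get(w);
                counts.put(w, (c == null) ? 1 : c + 1);
            }
        }
        System.out.println(counts); // prints: {download=2, linux=2, software=2}
    }
}
```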
