From f4a5c287fed21e0f439334dacb7e22172ea5f8a2 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 8 Sep 2007 11:50:19 +0000 Subject: [PATCH] re-implemented post-ranking of search results (should enhanced search result quality) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4080 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/yacy/search.java | 11 +- htroot/yacysearchitem.java | 18 +-- .../anomic/plasma/plasmaSearchContainer.java | 9 +- .../de/anomic/plasma/plasmaSearchEvent.java | 152 ++++++++++++++++-- .../de/anomic/plasma/plasmaSearchQuery.java | 2 +- .../plasma/plasmaSearchRankingProfile.java | 26 +-- source/de/anomic/yacy/yacyClient.java | 7 +- 8 files changed, 179 insertions(+), 48 deletions(-) diff --git a/build.properties b/build.properties index ebaa67364..1273505a7 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.543 +releaseVersion=0.544 releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseFileParentDir=yacy diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index e3688556d..d11ab08cc 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import de.anomic.http.httpHeader; @@ -226,12 +227,14 @@ public final class search { // prepare reference hints localProcess.startTimer(); - Object[] ws = theSearch.references(10); + Set ws = theSearch.references(10); StringBuffer refstr = new StringBuffer(); - for (int j = 0; j < ws.length; j++) - refstr.append(",").append((String) ws[j]); + Iterator j = ws.iterator(); + while (j.hasNext()) { + refstr.append(",").append((String) j.next()); + } prop.putASIS("references", (refstr.length() > 0) ? 
refstr.substring(1) : new String(refstr)); - localProcess.yield("reference collection", ws.length); + localProcess.yield("reference collection", ws.size()); } prop.putASIS("indexabstract", new String(indexabstract)); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 0c46d1747..e106c4513 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -29,6 +29,7 @@ import java.net.MalformedURLException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Iterator; +import java.util.Set; import java.util.TreeSet; import de.anomic.http.httpHeader; @@ -64,6 +65,7 @@ public class yacysearchitem { String eventID = post.get("eventID", ""); boolean bottomline = post.get("bottomline", "false").equals("true"); boolean authenticated = sb.adminAuthenticated(header) >= 2; + int item = post.getInt("item", -1); // find search event plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(eventID); @@ -74,19 +76,19 @@ public class yacysearchitem { prop.put("offset", theQuery.neededResults() - theQuery.displayResults() + 1); prop.put("global", theSearch.getGlobalCount()); prop.put("total", theSearch.getGlobalCount() + theSearch.getLocalCount()); - prop.put("items", theQuery.displayResults()); + prop.put("items", (item < 0) ? 
theQuery.neededResults() : item + 1); if (bottomline) { // attach the bottom line with search references (topwords) - final Object[] references = theSearch.references(20); - int hintcount = references.length; - if (hintcount > 0) { + final Set references = theSearch.references(20); + if (references.size() > 0) { prop.put("references", 1); // get the topwords final TreeSet topwords = new TreeSet(kelondroNaturalOrder.naturalOrder); String tmp = ""; - for (int i = 0; i < hintcount; i++) { - tmp = (String) references[i]; + Iterator i = references.iterator(); + while (i.hasNext()) { + tmp = (String) i.next(); if (tmp.matches("[a-z]+")) { topwords.add(tmp); } @@ -106,7 +108,7 @@ public class yacysearchitem { } String word; - hintcount = 0; + int hintcount = 0; final Iterator iter = topwords.iterator(); while (iter.hasNext()) { word = (String) iter.next(); @@ -134,8 +136,6 @@ public class yacysearchitem { prop.put("references", 0); // generate result object - int item = post.getInt("item", -1); - prop.put("items", (item < 0) ? 
theQuery.displayResults() : item + 1); plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item); if (result == null) { diff --git a/source/de/anomic/plasma/plasmaSearchContainer.java b/source/de/anomic/plasma/plasmaSearchContainer.java index 760aec759..de3f41232 100644 --- a/source/de/anomic/plasma/plasmaSearchContainer.java +++ b/source/de/anomic/plasma/plasmaSearchContainer.java @@ -145,10 +145,15 @@ public class plasmaSearchContainer { return this.globalcount; } - public Object[] getReferences(int count) { + public Set getReferences(int count) { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls - return ref.getScores(count, false, 2, Integer.MAX_VALUE); + Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE); + TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER); + for (int i = 0; i < refs.length; i++) { + s.add((String) refs[i]); + } + return s; } public void addReferences(String[] words) { diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 9ad29992f..3e05eaa49 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -51,7 +51,7 @@ import de.anomic.yacy.yacyURL; public final class plasmaSearchEvent { - public static int workerThreadCount = 5; + public static int workerThreadCount = 10; public static String lastEventID = ""; private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes @@ -71,6 +71,7 @@ public final class plasmaSearchEvent { private int localcount; private resultWorker[] workerThreads; private ArrayList resultList; // list of this.Entry objects + //private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not 
be changed again private HashMap failedURLs; // a mapping from a urlhash to a fail reason string TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets @@ -97,6 +98,7 @@ public final class plasmaSearchEvent { this.localcount = 0; this.workerThreads = null; this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets + //this.resultListLock = 0; // no locked elements until now this.failedURLs = new HashMap(); // a map of urls to reason strings where a worker thread tried to work on, but failed. // snippets do not need to match with the complete query hashes, @@ -120,7 +122,7 @@ public final class plasmaSearchEvent { // the result of the fetch is then in the rcGlobal process.startTimer(); serverLog.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); - primarySearchThreads = yacySearch.primaryRemoteSearches( + this.primarySearchThreads = yacySearch.primaryRemoteSearches( plasmaSearchQuery.hashSet2hashString(query.queryHashes), plasmaSearchQuery.hashSet2hashString(query.excludeHashes), "", @@ -136,7 +138,7 @@ public final class plasmaSearchEvent { ranking, query.constraint, (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ? 
null : preselectedPeerHashes); - process.yield("remote search thread start", primarySearchThreads.length); + process.yield("remote search thread start", this.primarySearchThreads.length); // meanwhile do a local search Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null); @@ -400,6 +402,22 @@ return false; } + private boolean anyRemoteSearchAlive() { + // check primary search threads + if ((this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0)) { + for (int i = 0; i < this.primarySearchThreads.length; i++) { + if ((this.primarySearchThreads[i] != null) && (this.primarySearchThreads[i].isAlive())) return true; + } + } + // maybe a secondary search thread is alive, check this + if ((this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0)) { + for (int i = 0; i < this.secondarySearchThreads.length; i++) { + if ((this.secondarySearchThreads[i] != null) && (this.secondarySearchThreads[i].isAlive())) return true; + } + } + return false; + } + public plasmaSearchQuery getQuery() { return query; } @@ -454,7 +472,7 @@ // if worker threads had been alive, but did not succeed, start them again to fetch missing links if ((query.onlineSnippetFetch) && (!event.anyWorkerAlive()) && - (event.resultList.size() < query.neededResults()) && + (event.resultList.size() < query.neededResults() + 10) && ((event.getLocalCount() + event.getGlobalCount()) > event.resultList.size())) { // set new timeout event.eventTime = System.currentTimeMillis(); @@ -493,10 +511,14 @@ public void run() { // sleep first to give remote loading threads a chance to fetch entries - try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {} + if (anyRemoteSearchAlive()) try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {} // start fetching urls and snippets - while ((resultList.size() < 
query.neededResults() + query.displayResults()) && (System.currentTimeMillis() < this.timeout)) { + while (true) { + + if (resultList.size() > query.neededResults() + query.displayResults()) break; // computed enough + + if (System.currentTimeMillis() > this.timeout) break; // time is over // try secondary search prepareSecondarySearch(); // will be executed only once @@ -505,9 +527,14 @@ this.entry = null; entry = nextOrder(); if (entry == null) { - // wait and try again - try {Thread.sleep(100);} catch (InterruptedException e) {} - continue; + if (anyRemoteSearchAlive()) { + // wait and try again + try {Thread.sleep(100);} catch (InterruptedException e) {} + continue; + } else { + // we will not see that there come more results in + break; + } } indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry); @@ -531,7 +558,7 @@ System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url()); } - System.out.println("DEBUG: resultWorker thread " + id + " terminated"); + serverLog.logInfo("SEARCH", "resultWorker thread " + id + " terminated"); } private indexRWIEntry nextOrder() { @@ -574,29 +601,106 @@ serverLog.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason); } - public ResultEntry oneResult(int item) { // first sleep a while to give accumulation threads a chance to work long sleeptime = this.eventTime + (this.query.maximumTime / this.query.displayResults() * ((item % this.query.displayResults()) + 1)) - System.currentTimeMillis(); - if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) && - (anyWorkerAlive()) && - (sleeptime > 0)) try {Thread.sleep(sleeptime);} catch (InterruptedException e) {} + if ((anyWorkerAlive()) && (sleeptime > 0)) { + try {Thread.sleep(sleeptime);} catch (InterruptedException e) {} + } - // then sleep until a result is available + // if there are less than 
10 more results available, sleep some extra time to get a chance that the "common sense" ranking algorithm can work + if ((this.resultList.size() <= item + 10) && (anyWorkerAlive())) { + try {Thread.sleep(300);} catch (InterruptedException e) {} + } + // then sleep until any result is available (that should not happen) while ((this.resultList.size() <= item) && (anyWorkerAlive())) { try {Thread.sleep(100);} catch (InterruptedException e) {} } // finally, if there is something, return the result synchronized (this.resultList) { + // check if we have enough entries if (this.resultList.size() <= item) return null; - // todo: fetch best result (switch) from item position to end of resultList + // fetch the best entry from the resultList, not the entry from item position + // whenever a specific entry was switched in its position and was returned here + // a moving pointer is set to assign that item position as not changeable + int bestpick = postRankingFavourite(item); + if (bestpick != item) { + // switch the elements + ResultEntry buf = (ResultEntry) this.resultList.get(bestpick); + serverLog.logInfo("SEARCH_POSTRANKING", "prefering [" + bestpick + "] " + buf.urlstring() + " over [" + item + "] " + ((ResultEntry) this.resultList.get(item)).urlstring()); + this.resultList.set(bestpick, (ResultEntry) this.resultList.get(item)); + this.resultList.set(item, buf); + } + //this.resultListLock = item; // lock the element; be prepared to return it return (ResultEntry) this.resultList.get(item); } } + private int postRankingFavourite(int item) { + // do a post-ranking on resultList, which should be locked upon time of this call + long rank, bestrank = 0; + int bestitem = item; + ResultEntry entry; + for (int i = item; i < this.resultList.size(); i++) { + entry = (ResultEntry) this.resultList.get(i); + rank = this.ranking.postRanking(this.query, this.references(10), entry, item); + if (rank > bestrank) { + bestrank = rank; + bestitem = i; + } + } + return bestitem; + } + + /* 
+ public void removeRedundant() { + // remove all urls from the pageAcc structure that occur double by specific redundancy rules + // a link is redundant, if a sub-path of the url is cited before. redundant urls are removed + // we find redundant urls by iteration over all elements in pageAcc + Iterator i = pageAcc.entrySet().iterator(); + HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation + Map.Entry entry; + + // first scan all entries and find all urls that are referenced + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + paths.put(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true), entry.getKey()); + //if (path != null) path = shortenPath(path); + //if (path != null) paths.put(path, entry.getKey()); + } + + // now scan the pageAcc again and remove all redundant urls + i = pageAcc.entrySet().iterator(); + String shorten; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + shorten = shortenPath(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true)); + // scan all subpaths of the url + while (shorten != null) { + if (pageAcc.size() <= query.wantedResults) break; + if (paths.containsKey(shorten)) { + //System.out.println("deleting path from search result: " + path + " is redundant to " + shorten); + try { + i.remove(); + } catch (IllegalStateException e) { + + } + } + shorten = shortenPath(shorten); + } + } + } + + private static String shortenPath(String path) { + int pos = path.lastIndexOf('/'); + if (pos < 0) return null; + return path.substring(0, pos); + } + */ + public ArrayList completeResults(long waitingtime) { long timeout = System.currentTimeMillis() + waitingtime; while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) { @@ -743,7 +847,8 @@ public final class plasmaSearchEvent { //assert e != null; } - public Object[] references(int count) { + public Set references(int count) { + // returns a set of words that are 
computed as toplist return this.rankedCache.getReferences(count); } @@ -791,6 +896,7 @@ public final class plasmaSearchEvent { if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p); } } + public String hash() { return urlentry.hash(); } @@ -821,6 +927,18 @@ public final class plasmaSearchEvent { public int filesize() { return urlentry.size(); } + public int limage() { + return urlentry.limage(); + } + public int laudio() { + return urlentry.laudio(); + } + public int lvideo() { + return urlentry.lvideo(); + } + public int lapp() { + return urlentry.lapp(); + } public indexRWIEntry word() { return urlentry.word(); } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 3789e12b3..0c9fd9c20 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -115,7 +115,7 @@ public final class plasmaSearchQuery { } public int displayResults() { - // the number if result lines that are displayed at once (size of result page) + // the number of result lines that are displayed at once (size of result page) return this.linesPerPage; } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 98fda8f1f..89ef1abdd 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -47,9 +47,9 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.index.indexRWIEntry; import de.anomic.yacy.yacyURL; -import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; public class plasmaSearchRankingProfile { @@ -290,30 +290,30 @@ public class plasmaSearchRankingProfile { public long postRanking( plasmaSearchQuery query, Set topwords, - String[] urlcomps, - String[] descrcomps, 
- indexURLEntry page, + plasmaSearchEvent.ResultEntry rentry, int position) { long ranking = (255 - position) << 8; // for media search: prefer pages with many links - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ranking += page.limage() << coeff_cathasimage; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ranking += page.limage() << coeff_cathasaudio; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ranking += page.limage() << coeff_cathasvideo; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) ranking += page.limage() << coeff_cathasapp; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ranking += rentry.limage() << coeff_cathasimage; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ranking += rentry.laudio() << coeff_cathasaudio; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ranking += rentry.lvideo() << coeff_cathasvideo; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) ranking += rentry.lapp() << coeff_cathasapp; // prefer hit with 'prefer' pattern - indexURLEntry.Components comp = page.comp(); - if (comp.url().toNormalform(true, true).matches(query.prefer)) ranking += 256 << coeff_prefer; - if (comp.title().matches(query.prefer)) ranking += 256 << coeff_prefer; + if (rentry.url().toNormalform(true, true).matches(query.prefer)) ranking += 256 << coeff_prefer; + if (rentry.title().matches(query.prefer)) ranking += 256 << coeff_prefer; // apply 'common-sense' heuristic using references + String urlstring = rentry.url().toNormalform(true, true); + String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring); + String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); for (int j = 0; j < urlcomps.length; j++) { - if (topwords.contains(urlcomps[j])) ranking += 256 << coeff_urlcompintoplist; + if (topwords.contains(urlcomps[j])) ranking += Math.max(1, 256 - urlstring.length()) << coeff_urlcompintoplist; } for (int j = 0; j 
< descrcomps.length; j++) { - if (topwords.contains(descrcomps[j])) ranking += 256 << coeff_descrcompintoplist; + if (topwords.contains(descrcomps[j])) ranking += Math.max(1, 256 - rentry.title().length()) << coeff_descrcompintoplist; } // apply query-in-result matching diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index dd1c3abee..825128746 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -513,7 +513,12 @@ public final class yacyClient { // integrate remote topwords String references = (String) result.get("references"); - if (references != null) containerCache.addReferences(references.split(",")); + yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references); + if (references != null) { + // add references twice, so they can be countet (must have at least 2 entries) + containerCache.addReferences(references.split(",")); + containerCache.addReferences(references.split(",")); + } } // insert the containers to the index