@@ -58,6 +58,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
 
 public final class plasmaSearchResult {
     
+    private plasmaWordIndexEntry entryMin, entryMax;
     private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
     private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
     private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
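Note (editor): `pageAcc` is keyed by an "order hash", so simply iterating the TreeMap presumably walks the pages in ranking order. A minimal sketch of that pattern; the key scheme and all names below are hypothetical illustrations, not YaCy's actual code:

```java
// Illustration only; the orderKey scheme is an assumption, not from this patch.
import java.util.TreeMap;

public class PageAccSketch {
    public static void main(String[] args) {
        TreeMap<String, String> pageAcc = new TreeMap<String, String>();
        pageAcc.put(orderKey(9000L, "hashA"), "http://example.net/a");
        pageAcc.put(orderKey(12000L, "hashB"), "http://example.net/b");
        pageAcc.put(orderKey(500L, "hashC"), "http://example.net/c");
        // ascending key order == descending ranking order
        for (String url : pageAcc.values()) System.out.println(url); // b, a, c
    }
    // fixed-width complement of the ranking, plus the url hash to keep keys unique
    static String orderKey(long ranking, String urlhash) {
        return String.format("%019d", Long.MAX_VALUE - ranking) + urlhash;
    }
}
```

Encoding the rank into a sortable string key lets a single TreeMap serve as both accumulator and sorted result list.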
@@ -72,6 +73,8 @@ public final class plasmaSearchResult {
         this.query = query;
         this.globalContributions = 0;
         this.localContributions = 0;
+        this.entryMin = null;
+        this.entryMax = null;
     }
     
     public plasmaSearchResult cloneSmart() {
@@ -101,10 +104,10 @@ public final class plasmaSearchResult {
     }
     
     protected void addResult(plasmaWordIndexEntry indexEntry, plasmaCrawlLURL.Entry page) {
         // this does 3 things:
         // 1. simply store indexEntry and page to a cache
         // 2. calculate references and store them to cache
         // 3. add reference to reference sorting table
         
+        // make min/max for normalization
+        if (entryMin == null) entryMin = (plasmaWordIndexEntry) indexEntry.clone(); else entryMin.min(indexEntry);
+        if (entryMax == null) entryMax = (plasmaWordIndexEntry) indexEntry.clone(); else entryMax.max(indexEntry);
+        
         // take out relevant information for reference computation
         URL url = page.url();
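Note (editor): `entryMin` and `entryMax` accumulate, per attribute, the smallest and largest values seen across all results, and `generateNormalized(entryMin, entryMax)` presumably rescales each attribute into a fixed range so that the weighted sum in `query.ranking(...)` compares like with like. A hypothetical sketch of that kind of min/max scaling; `normalize` and the 0..255 target range are assumptions, not the actual plasmaWordIndexEntry API:

```java
// Hypothetical sketch of min/max normalization, not YaCy's implementation.
public class NormalizeSketch {
    // rescale a raw attribute into 0..255 using the observed extremes
    static int normalize(long value, long min, long max) {
        if (max <= min) return 0; // degenerate range: all observed values identical
        return (int) (255L * (value - min) / (max - min));
    }
    public static void main(String[] args) {
        // e.g. quality values observed between 12 and 60 across all results
        System.out.println(normalize(12, 12, 60)); // 0   (worst seen)
        System.out.println(normalize(36, 12, 60)); // 127 (mid-range)
        System.out.println(normalize(60, 12, 60)); // 255 (best seen)
    }
}
```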
@@ -136,33 +139,25 @@ public final class plasmaSearchResult {
         plasmaCrawlLURL.Entry page;
         String[] urlcomps;
         String[] descrcomps;
-        long ranking, factor;
+        long ranking;
         String queryhash;
         for (int i = 0; i < results.size(); i++) {
             // take out values from result array
             resultVector = (Object[]) results.get(i);
             indexEntry = (plasmaWordIndexEntry) resultVector[0];
-            page = (plasmaCrawlLURL.Entry) resultVector[1];
-            urlcomps = (String[]) resultVector[2];
-            descrcomps = (String[]) resultVector[3];
             
             // apply pre-calculated order attributes
-            ranking = 0;
-            factor = 4096L*4096L;
-            for (int j = 0; j < 3; j++) {
-                if (query.order[j].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L;
-                else if (query.order[j].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L;
-                else if (query.order[j].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * plasmaSearchPreOrder.ybr_p(indexEntry.getUrlHash());
-                factor = factor / 4096L;
-            }
-            int wordpos = indexEntry.posintext();
-            if (wordpos == 0) wordpos = 1000;
-            ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance());
+            ranking = query.ranking(indexEntry.generateNormalized(entryMin, entryMax));
             
             // apply 'common-sense' heuristic using references
-            for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length;
-            for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking += 10L*4096L*4096L / descrcomps.length;
+            urlcomps = (String[]) resultVector[2];
+            for (int j = 0; j < urlcomps.length; j++) {
+                if (commonSense.contains(urlcomps[j])) ranking += 1 << 12;
+            }
+            descrcomps = (String[]) resultVector[3];
+            for (int j = 0; j < descrcomps.length; j++) {
+                if (commonSense.contains(descrcomps[j])) ranking += 1 << 11;
+            }
             
             // apply query-in-result matching
             Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
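Note (editor): the hand-rolled order loop gives way to `query.ranking(...)` over the normalized entry, and the `4096L*4096L`-scaled, length-divided bonuses become flat power-of-two increments, trading per-result divisions for constant additions on the new, smaller scale. A small illustration of the magnitudes involved (the component count of 8 is a made-up sample):

```java
// Illustration only: old multiplier-based bonus vs. new flat shift-based bonus.
public class WeightSketch {
    public static void main(String[] args) {
        int urlcompsLength = 8; // hypothetical component count of a sample URL
        long oldUrlBonus = 10L * 4096L * 4096L / urlcompsLength; // 20971520, shrinks with length
        long newUrlBonus = 1 << 12;                              // 4096, flat per match
        System.out.println("old commonSense url bonus: " + oldUrlBonus);
        System.out.println("new commonSense url bonus: " + newUrlBonus);
        // new weight ladder: descr commonSense < url commonSense < query-in-url < query-in-descr
        System.out.println((1 << 11) + " < " + (1 << 12) + " < " + (1 << 13) + " < " + (1 << 14));
    }
}
```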
@@ -170,17 +165,18 @@ public final class plasmaSearchResult {
             Iterator shi = query.queryHashes.iterator();
             while (shi.hasNext()) {
                 queryhash = (String) shi.next();
-                if (urlcomph.contains(queryhash)) ranking += 90L*4096L*4096L / urlcomps.length / query.queryHashes.size();
-                if (descrcomph.contains(queryhash)) ranking += 40L*4096L*4096L / descrcomps.length / query.queryHashes.size();
+                if (urlcomph.contains(queryhash)) ranking += 1 << 13;
+                if (descrcomph.contains(queryhash)) ranking += 1 << 14;
             }
             
             // prefer short urls
-            ranking -= 64L * page.url().toString().length();
-            ranking -= 64L * urlcomps.length;
+            page = (plasmaCrawlLURL.Entry) resultVector[1];
+            ranking += (255 - page.url().toString().length()) << 10;
+            ranking += (24 - urlcomps.length) << 10;
             
             // prefer long descriptions
-            ranking += 64L * (40 - Math.abs(40 - Math.min(40, page.descr().length())));
-            ranking += 64L * ( 8 - Math.abs( 8 - Math.min( 8, descrcomps.length)));
+            ranking += (40 - Math.abs(40 - Math.min(40, page.descr().length()))) << 10;
+            ranking += ( 8 - Math.abs( 8 - Math.min( 8, descrcomps.length))) << 10;
             
             // insert value
             //System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
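Note (editor): the rewritten "prefer long descriptions" term is a capped ramp: the bonus grows linearly with the component count up to a target of 8 and stays at the maximum beyond it (`Math.min` does the capping, which also makes `Math.abs` a no-op here). Sample values of the `descrcomps` form, illustration only:

```java
// Illustration only: sample values of (8 - Math.abs(8 - Math.min(8, n))) << 10.
public class DescrBonusSketch {
    static long bonus(int n) {
        return (8 - Math.abs(8 - Math.min(8, n))) << 10;
    }
    public static void main(String[] args) {
        System.out.println(bonus(0));  // 0    : no description components, no bonus
        System.out.println(bonus(4));  // 4096 : halfway to the target of 8
        System.out.println(bonus(8));  // 8192 : target length, maximum bonus
        System.out.println(bonus(20)); // 8192 : capped, long descriptions stay at maximum
    }
}
```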