enhancements to ranking:
- indexURL.probablyWordURL now returns the guessed URL (or null) instead of a boolean and loops over a testTLDs table instead of eight copy-pasted checks
- new indexURL.isWordRootURL verifies that a URL hash is the root URL of a guessable word-URL
- new plasmaSearchPreOrder.remove() filters root-domain path extensions and duplicate domains before URLs are fetched; plasmaSearchResult.removeDoubleDom() is removed

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2535 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 63893003be
commit 64bed59ee8

@@ -571,36 +571,39 @@ public class indexURL {
         return hash3 + hash2 + hash1 + hash0;
     }
 
-    private static final char[] rootURLFlags = new char[] {
-        subdomPortPath("www", 80, ""),
-        subdomPortPath("", 80, "")
-    };
-
     private static char subdomPortPath(String subdom, int port, String rootpath) {
         return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0);
     }
 
+    private static final char rootURLFlag = subdomPortPath("www", 80, "");
+
     public static final boolean probablyRootURL(String urlHash) {
-        for (int i = 0; i < rootURLFlags.length; i++) if (urlHash.charAt(6) == rootURLFlags[i]) return true;
-        return false;
+        return (urlHash.charAt(5) == rootURLFlag);
     }
 
     private static String protocolHostPort(String protocol, String host, int port) {
         return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5);
     }
 
-    public static final boolean probablyWordURL(String urlHash, String word) {
-        if (word == null) return false;
+    private static String[] testTLDs = new String[] {"com", "net", "org", "uk", "fr", "de", "es", "it"};
+
+    public static final URL probablyWordURL(String urlHash, String word) {
+        if (word == null) return null;
         String pattern = urlHash.substring(6, 11);
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".com", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".net", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".org", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".uk", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".fr", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".de", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".es", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".it", 80))) return true;
-        return false;
+        for (int i = 0; i < testTLDs.length; i++) {
+            if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80)))
+                try {
+                    return new URL("http://www." + word.toLowerCase() + "." + testTLDs[i]);
+                } catch (MalformedURLException e) {
+                    return null;
+                }
+        }
+        return null;
+    }
+
+    public static final boolean isWordRootURL(String givenURLHash, String word) {
+        if (!(probablyRootURL(givenURLHash))) return false;
+        URL wordURL = probablyWordURL(givenURLHash, word);
+        if (wordURL == null) return false;
+        return urlHash(wordURL).equals(givenURLHash);
+    }
 
     public static final int domLengthEstimation(String urlHash) {
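Note on the hunk above: probablyWordURL now returns the reconstructed URL (or null) instead of a bare boolean, so callers can re-hash and display it. The underlying trick is that characters 6-10 of a YaCy URL hash fingerprint protocol:host:port, so a hash can be tested against guessed hosts like http://www.<word>.<tld> without loading anything. A minimal self-contained sketch of that prefix test, using a plain hex MD5 in place of YaCy's kelondroBase64Order coder (so the actual fingerprints differ):

    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;

    // Sketch only: YaCy encodes the MD5 with kelondroBase64Order and keeps 5
    // characters; plain hex is used here so the example runs standalone.
    public class WordURLSketch {
        static String hostFingerprint(String protocol, String host, int port) throws NoSuchAlgorithmException {
            byte[] md5 = MessageDigest.getInstance("MD5").digest((protocol + ":" + host + ":" + port).getBytes());
            StringBuilder hex = new StringBuilder();
            for (byte b : md5) hex.append(String.format("%02x", b));
            return hex.substring(0, 5); // 5-character fingerprint, like protocolHostPort()
        }

        // returns the guessed word-URL as a string, or null if no TLD matches
        static String probableWordHost(String fingerprint, String word) throws NoSuchAlgorithmException {
            String[] testTLDs = new String[] {"com", "net", "org", "uk", "fr", "de", "es", "it"};
            for (int i = 0; i < testTLDs.length; i++) {
                String host = "www." + word.toLowerCase() + "." + testTLDs[i];
                if (fingerprint.equals(hostFingerprint("http", host, 80))) return "http://" + host;
            }
            return null;
        }

        public static void main(String[] args) throws NoSuchAlgorithmException {
            String fingerprint = hostFingerprint("http", "www.yacy.net", 80);
            System.out.println(probableWordHost(fingerprint, "yacy")); // prints http://www.yacy.net
        }
    }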

@@ -223,6 +223,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             preorderTime = preorderTime - (System.currentTimeMillis() - pst);
             if (preorderTime < 0) preorderTime = 200;
             plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime);
+            preorder.remove(true, true);
             profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
             profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size());
 
@@ -241,11 +242,10 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
             try {
                 while (preorder.hasNext()) {
-                    //if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
-                    //if (acc.sizeFetched() >= minEntries) break;
+                    if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
                     preorderEntry = preorder.next();
                     entry = (indexEntry) preorderEntry[0];
                     // load only urls if there was not yet a root url of that hash
                     preranking = (Long) preorderEntry[1];
                     // find the url entry
                     page = urlStore.load(entry.urlHash(), entry);
 
@@ -267,7 +267,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             // apply filter
             profileLocal.startTimer();
             acc.removeRedundant();
-            //acc.removeDoubleDom();
             profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
             profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());
 
@@ -281,6 +280,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             profileLocal.startTimer();
             if (maxtime < 0) maxtime = 200;
             plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, maxtime);
+            preorder.remove(true, true);
             profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
             profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size());
 
@@ -320,7 +320,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             // apply filter
             profileLocal.startTimer();
             acc.removeRedundant();
-            //acc.removeDoubleDom();
             profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
             profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());
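The two preorder.remove(true, true) calls above run the new de-duplication before any URL is fetched from urlStore, and the reworked break condition keeps the postorder loop fetching until the time budget is spent and minEntries results are in hand (both, not either). A compact standalone model of that break rule, with illustrative names (candidates, budgetMillis) that are not YaCy's:

    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;

    // Model of the revised postorder loop: stop only once the deadline has
    // passed AND at least minEntries candidates have been fetched.
    public class PostorderBudgetSketch {
        static int fetch(List<String> candidates, int minEntries, long budgetMillis) {
            long limit = System.currentTimeMillis() + budgetMillis;
            int fetched = 0;
            Iterator<String> i = candidates.iterator();
            while (i.hasNext()) {
                if ((System.currentTimeMillis() >= limit) && (fetched >= minEntries)) break;
                i.next(); // stands in for urlStore.load(...)
                fetched++;
            }
            return fetched;
        }

        public static void main(String[] args) {
            // deadline already passed, but the loop still fetches the minimum
            System.out.println(fetch(Arrays.asList("a", "b", "c"), 2, 0)); // prints 2
        }
    }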

@@ -44,13 +44,16 @@ package de.anomic.plasma;
 
 import java.io.File;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.TreeMap;
+import java.util.Map;
+import java.util.Iterator;
 
 import de.anomic.server.serverCodings;
 import de.anomic.server.serverFileUtils;
 import de.anomic.index.indexContainer;
 import de.anomic.index.indexEntry;
+import de.anomic.index.indexURL;
 import de.anomic.kelondro.kelondroBinSearch;
 
 public final class plasmaSearchPreOrder {
 
@@ -100,6 +103,31 @@ public final class plasmaSearchPreOrder {
         }
     }
 
+    public void remove(boolean rootDomExt, boolean doubleDom) {
+        // this removes all references to urls that are extended paths of existing 'RootDom'-urls
+        HashSet rootDoms = new HashSet();
+        HashSet doubleDoms = new HashSet();
+        Iterator i = pageAcc.entrySet().iterator();
+        Map.Entry entry;
+        indexEntry iEntry;
+        String hashpart;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            iEntry = (indexEntry) entry.getValue();
+            hashpart = iEntry.urlHash().substring(6);
+            if (((rootDomExt) && (rootDoms.contains(hashpart))) ||
+                ((doubleDom) && (doubleDoms.contains(hashpart)))) {
+                i.remove();
+                if (pageAcc.size() <= query.wantedResults) return;
+            } else {
+                if (indexURL.isWordRootURL(iEntry.urlHash(), query.words(""))) {
+                    rootDoms.add(hashpart);
+                }
+            }
+            doubleDoms.add(hashpart);
+        }
+    }
+
     public static void loadYBR(File rankingPath, int count) {
         // load ranking tables
         if (rankingPath.exists()) {
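The remove() method added above walks pageAcc in ranking order once: the first entry per domain fingerprint (urlHash().substring(6)) survives, later ones are dropped when doubleDom is set, and entries whose fingerprint belongs to a known word-root URL are dropped as path extensions when rootDomExt is set. A reduced sketch of the keep-first-per-domain pass, using the full host string as the fingerprint for readability:

    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.TreeMap;

    // Reduced model of plasmaSearchPreOrder.remove(..., doubleDom=true): one
    // pass over the ranked map keeps the first URL per host, removes the rest.
    public class DoubleDomSketch {
        static void removeDoubleDom(TreeMap<Long, String> pageAcc) {
            HashSet<String> seenHosts = new HashSet<String>();
            Iterator<Map.Entry<Long, String>> i = pageAcc.entrySet().iterator();
            while (i.hasNext()) {
                String host = i.next().getValue();
                if (!seenHosts.add(host)) i.remove(); // add() returns false on repeats
            }
        }

        public static void main(String[] args) {
            TreeMap<Long, String> pageAcc = new TreeMap<Long, String>();
            pageAcc.put(1L, "www.a.com");
            pageAcc.put(2L, "www.b.com");
            pageAcc.put(3L, "www.a.com");
            removeDoubleDom(pageAcc);
            System.out.println(pageAcc); // prints {1=www.a.com, 2=www.b.com}
        }
    }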

@ -175,7 +175,7 @@ public class plasmaSearchRankingProfile {
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += (256 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
ranking += (indexURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0;
ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord) != null) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
/*
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
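In the ranking formula above, each component is weighted by a left shift: a coefficient c from the ranking profile multiplies its base value by 2^c, so a probablyWordURL match (base value 256) can be tuned to dominate the other components. A worked example with invented coefficient values:

    // Worked example of the shift weighting: value << c equals value * 2^c.
    // The coefficient values below are made up for illustration.
    public class RankingShiftSketch {
        public static void main(String[] args) {
            long ranking = 0;
            int queryInUrlCoeff = 4;
            int domLengthCoeff = 2;
            ranking += 256 << queryInUrlCoeff;        // word found in url: 256 * 16 = 4096
            ranking += (256 - 240) << domLengthCoeff; // short-domain bonus:  16 *  4 =   64
            System.out.println(ranking);              // prints 4160
        }
    }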

@ -161,21 +161,6 @@ public final class plasmaSearchResult {
results = null;
}
public void removeDoubleDom() {
Iterator i = pageAcc.entrySet().iterator();
HashSet doms = new HashSet();
Map.Entry entry;
String dom;
while (i.hasNext()) {
if (pageAcc.size() <= query.wantedResults) return;
entry = (Map.Entry) i.next();
dom = ((plasmaCrawlLURL.Entry) entry.getValue()).url().getHost();
if (doms.contains(dom)) i.remove(); else doms.add(dom);
}
}
public void removeRedundant() {
// remove all urls from the pageAcc structure that occur double by specific redundancy rules
// a link is redundant, if a sub-path of the url is cited before. redundant urls are removed

@ -2068,6 +2068,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*/
//addScoreForked(ref, gs, descr.split(" "));
//addScoreForked(ref, gs, urlstring.split("/"));
URL wordURL;
if (urlstring.matches(query.urlMask)) { //.* is default
snippet = snippetCache.retrieve(url, query.queryHashes, false, 260);
if (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH) {
@ -2086,7 +2087,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_results_" + i + "_former", formerSearch);
prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm(true) + ", domLengthEstimated=" + indexURL.domLengthEstimation(urlhash) +
((indexURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") +
((indexURL.probablyWordURL(urlhash, query.words(""))) ? ", probablyWordURL" : ""));
(((wordURL = indexURL.probablyWordURL(urlhash, query.words(""))) != null) ? ", probablyWordURL=" + wordURL.toNormalform() : ""));
// adding snippet if available
if (snippet.exists()) {
prop.put("type_results_" + i + "_snippet", 1);
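The rankingprops line above uses an assignment inside the condition so that probablyWordURL is called only once: its result is stored in wordURL and null-checked in the same expression. The same idiom with java.net.URL (YaCy's own URL class and its toNormalform() differ, so this is only a shape-alike with a hypothetical guess() helper):

    import java.net.MalformedURLException;
    import java.net.URL;

    // Shape-alike of the assign-and-test idiom: guess() stands in for
    // indexURL.probablyWordURL and may return null.
    public class AssignTestSketch {
        static URL guess(String word) {
            try {
                return new URL("http://www." + word + ".com");
            } catch (MalformedURLException e) {
                return null;
            }
        }

        public static void main(String[] args) {
            URL wordURL;
            String props = "domLengthEstimated=5"
                    + (((wordURL = guess("yacy")) != null) ? ", probablyWordURL=" + wordURL.toExternalForm() : "");
            System.out.println(props); // prints domLengthEstimated=5, probablyWordURL=http://www.yacy.com
        }
    }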
