From a8bc76820673a337096da3465c3f1836aa8ad89b Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 8 Sep 2006 16:04:50 +0000 Subject: [PATCH] enhancements to ranking evaluation git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2523 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/index/indexURL.java | 36 +++++++++++++++++-- source/de/anomic/index/indexURLEntry.java | 2 +- .../de/anomic/plasma/plasmaSearchEvent.java | 8 ++--- .../anomic/plasma/plasmaSearchPreOrder.java | 2 +- .../plasma/plasmaSearchRankingProfile.java | 9 +++-- .../de/anomic/plasma/plasmaSwitchboard.java | 4 ++- 6 files changed, 50 insertions(+), 11 deletions(-) diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index c413c1e5d..0ce486b20 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -563,14 +563,46 @@ public class indexURL { byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey); // form the 'local' part of the hash String hash3 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, 5); - char hash2 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0); + char hash2 = subdomPortPath(subdom, port, rootpath); // form the 'global' part of the hash - String hash1 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.getProtocol() + ":" + host + ":" + port)).substring(0, 5); + String hash1 = protocolHostPort(url.getProtocol(), host, port); char hash0 = kelondroBase64Order.enhancedCoder.encodeByte(flagbyte); // combine the hashes return hash3 + hash2 + hash1 + hash0; } + private static final char[] rootURLFlags = new char[] { /* hash2 characters for an empty root path on port 80: once with subdomain "www", once with an empty subdomain */ + subdomPortPath("www", 80, ""), + subdomPortPath("", 80, "") + }; + + private static char subdomPortPath(String subdom, int port, String rootpath) { /* first character of the enhancedCoder encoding of the MD5 of "subdom:port:rootpath"; used as the hash2 part of a URL hash */ + return 
kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0); + } + + public static final boolean probablyRootURL(String urlHash) { /* a URL hash is hash3 (index 0-4) + hash2 (index 5) + hash1 (index 6-10) + hash0 (index 11); the subdomain/port/root-path flag compared here is hash2, i.e. index 5 */ + for (int i = 0; i < rootURLFlags.length; i++) if (urlHash.charAt(5) == rootURLFlags[i]) return true; + return false; + } + + private static String protocolHostPort(String protocol, String host, int port) { /* first five characters of the enhancedCoder encoding of the MD5 of "protocol:host:port"; used as the hash1 part of a URL hash */ + return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5); + } + + public static final boolean probablyWordURL(String urlHash, String word) { /* true if the hash1 part (index 6-10) equals the hash of http://www.<word>.<tld>:80 for one of a fixed list of top-level domains; false for a null word */ + if (word == null) return false; + String pattern = urlHash.substring(6, 11); + if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".com", 80))) return true; + if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".net", 80))) return true; + if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".org", 80))) return true; + if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".uk", 80))) return true; + if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".fr", 80))) return true; + if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".de", 80))) return true; + if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".es", 80))) return true; + if (pattern.equals(protocolHostPort("http", "www." 
+ word.toLowerCase() + ".it", 80))) return true; + return false; + } + public static final int domLengthEstimation(String urlHash) { // generates an estimation of the original domain length int flagbyte = kelondroBase64Order.enhancedCoder.decodeByte(urlHash.charAt(11)); diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index d0ad2158e..408742d80 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -148,7 +148,7 @@ public class indexURLEntry implements Cloneable, indexEntry { } public String toPropertyForm(boolean displayFormat) { - return entry.toPropertyForm(true, displayFormat, displayFormat); + return entry.toPropertyForm(false, displayFormat, displayFormat); } public Entry toKelondroEntry() { diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 912847280..ae7481068 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -242,8 +242,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { try { while (preorder.hasNext()) { //if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break; - if (acc.sizeFetched() >= minEntries) break; - if (System.currentTimeMillis() >= postorderLimitTime) break; + //if (acc.sizeFetched() >= minEntries) break; + if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break; /* leave the loop only when the time limit has passed AND at least minEntries results were fetched; otherwise keep consuming preorder entries */ preorderEntry = preorder.next(); entry = (indexEntry) preorderEntry[0]; preranking = (Long) preorderEntry[1]; @@ -298,8 +298,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { try { while (preorder.hasNext()) { //if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break; - if (acc.sizeFetched() >= minEntries) break; - 
if (System.currentTimeMillis() >= postorderLimitTime) break; + //if (acc.sizeFetched() >= minEntries) break; + if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break; /* leave the loop only when the time limit has passed AND at least minEntries results were fetched; otherwise keep consuming preorder entries */ preorderEntry = preorder.next(); entry = (indexEntry) preorderEntry[0]; preranking = (Long) preorderEntry[1]; diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 253de37ff..1e506f8a2 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -96,7 +96,7 @@ public final class plasmaSearchPreOrder { this.pageAcc = new TreeMap(); for (int j = 0; j < count; j++) { iEntry = (indexEntry) i.next(); - pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax)), 16) + iEntry.urlHash(), iEntry); + pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry); } } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 0b3edb527..8535db997 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -164,7 +164,7 @@ public class plasmaSearchRankingProfile { return new String(ext); } - public long preRanking(indexEntry normalizedEntry) { + public long preRanking(indexEntry normalizedEntry, String searchedWord) { /* searchedWord is handed to indexURL.probablyWordURL together with the entry's URL hash */ // the normalizedEntry must be a normalized indexEntry long ranking = 0; ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue(); @@ -174,6 +174,12 @@ public class plasmaSearchRankingProfile { ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue(); ranking += (normalizedEntry.hitcount() == 0) ? 
0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue(); ranking += (255 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue(); + final String entryHash = normalizedEntry.urlHash(); /* shared by both URL heuristics below */ + final boolean isRootURL = indexURL.probablyRootURL(entryHash); + /* probablyWordURL hashes several candidate host names, so it is evaluated exactly once per entry */ + final boolean isWordURL = indexURL.probablyWordURL(entryHash, searchedWord); + if (isRootURL) ranking += 16 << ((Integer) coeff.get(URLLENGTH)).intValue(); /* bonus for hashes that look like a site root */ + if (isWordURL) ranking += 256 << ((Integer) coeff.get(QUERYINURL)).intValue(); /* bonus when the host looks like www.<searchedWord>.<tld> */ return ranking; } @@ -219,7 +225,6 @@ public class plasmaSearchRankingProfile { ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue(); ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue(); - return ranking; } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 4d403ba78..c00c884f0 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2084,7 +2084,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("type_results_" + i + "_size", Long.toString(urlentry.size())); prop.put("type_results_" + i + "_words", URLEncoder.encode(query.queryWords.toString(),"UTF-8")); prop.put("type_results_" + i + "_former", formerSearch); - prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm(true)); + prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm(true) + ", domLengthEstimated=" + 
indexURL.domLengthEstimation(urlhash) + + ((indexURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") + + ((indexURL.probablyWordURL(urlhash, query.words(""))) ? ", probablyWordURL" : "")); /* the rankingprops string additionally carries the estimated domain length and, when they apply, the probablyRootURL / probablyWordURL markers */ // adding snippet if available if (snippet.exists()) { prop.put("type_results_" + i + "_snippet", 1);