From fb7411d7bbc6f858fac7f1ddc92813d4d9c8a5d3 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 5 Feb 2006 01:47:51 +0000 Subject: [PATCH] re-structuring of ranking application: concentration of all ranking attributes in the plasmaSearchRankingProfile git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1541 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/index.java | 22 ++-- htroot/yacy/search.java | 2 +- .../anomic/plasma/plasmaSearchPreOrder.java | 2 +- .../de/anomic/plasma/plasmaSearchQuery.java | 4 - .../plasma/plasmaSearchRankingProfile.java | 107 ++++++++++++++++-- .../de/anomic/plasma/plasmaSearchResult.java | 44 ++----- 6 files changed, 122 insertions(+), 59 deletions(-) diff --git a/htroot/index.java b/htroot/index.java index c0d58f768..55418f943 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -161,16 +161,18 @@ public class index { (yacyCore.seedDB.mySeed != null) && (yacyCore.seedDB.mySeed.getAddress() != null)); - String order1="", order2="", order3=""; - if (order.startsWith("YBR")) order1 = plasmaSearchQuery.ORDER_YBR; - if (order.startsWith("Date")) order1 = plasmaSearchQuery.ORDER_DATE; - if (order.startsWith("Quality")) order1 = plasmaSearchQuery.ORDER_QUALITY; - if (order.indexOf("-YBR-") > 0) order2 = plasmaSearchQuery.ORDER_YBR; - if (order.indexOf("-Date-") > 0) order2 = plasmaSearchQuery.ORDER_DATE; - if (order.indexOf("-Quality-") > 0) order2 = plasmaSearchQuery.ORDER_QUALITY; - if (order.endsWith("YBR")) order3 = plasmaSearchQuery.ORDER_YBR; - if (order.endsWith("Date")) order3 = plasmaSearchQuery.ORDER_DATE; - if (order.endsWith("Quality")) order3 = plasmaSearchQuery.ORDER_QUALITY; + String order1=plasmaSearchRankingProfile.ORDER_DATE; + String order2=plasmaSearchRankingProfile.ORDER_YBR; + String order3=plasmaSearchRankingProfile.ORDER_QUALITY; + if (order.startsWith("YBR")) order1 = plasmaSearchRankingProfile.ORDER_YBR; + if (order.startsWith("Date")) order1 = plasmaSearchRankingProfile.ORDER_DATE; + if (order.startsWith("Quality")) order1 = plasmaSearchRankingProfile.ORDER_QUALITY; + if (order.indexOf("-YBR-") > 0) order2 = plasmaSearchRankingProfile.ORDER_YBR; + if (order.indexOf("-Date-") > 0) order2 = plasmaSearchRankingProfile.ORDER_DATE; + if (order.indexOf("-Quality-") > 0) order2 = plasmaSearchRankingProfile.ORDER_QUALITY; + if (order.endsWith("YBR")) order3 = plasmaSearchRankingProfile.ORDER_YBR; + if (order.endsWith("Date")) order3 = plasmaSearchRankingProfile.ORDER_DATE; + if (order.endsWith("Quality")) order3 = plasmaSearchRankingProfile.ORDER_QUALITY; String urlmask = ""; if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) { urlmask = ".*"; diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 7833a63df..7746f15a3 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -112,7 +112,7 @@ public final class search { yacyCore.log.logInfo("INIT HASH SEARCH: " + squery.queryHashes + " - " + squery.wantedResults + " links"); long timestamp1 = System.currentTimeMillis(); - plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY}); + plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchRankingProfile.ORDER_YBR, plasmaSearchRankingProfile.ORDER_DATE, plasmaSearchRankingProfile.ORDER_QUALITY}); plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults); plasmaSearchTimingProfile remoteTiming = null; plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache); diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 20933c090..dbfac54d0 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -140,7 +140,7 @@ public final class plasmaSearchPreOrder { i = container.entries(); for (int j = 0; j < count; j++) { indexEntry = (plasmaWordIndexEntry) i.next(); - pageAcc.put(serverCodings.encodeHex(this.ranking.ranking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry); + pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry); } } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 924112800..6b270f902 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -52,10 +52,6 @@ import de.anomic.server.serverByteBuffer; public final class plasmaSearchQuery { - public static final String ORDER_QUALITY = "Quality"; - public static final String ORDER_DATE = "Date"; - public static final String ORDER_YBR = "YBR"; - public static final int SEARCHDOM_LOCAL = 0; public static final int SEARCHDOM_GROUPDHT = 1; public static final int SEARCHDOM_GROUPALL = 2; diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 03d29d29a..d1ba1a3d7 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -41,30 +41,119 @@ package de.anomic.plasma; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Set; + public class plasmaSearchRankingProfile { + // old parameters for ordering + public static final String ORDER_QUALITY = "Quality"; + public static final String ORDER_DATE = "Date"; + public static final String ORDER_YBR = "YBR"; + + // pre-sort attributes + public static final String ENTROPY = "entropy"; + public static final String DATE = "date"; + public static final String YBR = "ybr"; + public static final String POSINTEXT = "posintext"; + public static final String WORDDISTANCE = "worddistance"; + public static final String HITCOUNT = "hitcount"; + public static final String DOMLENGTH = "domlength"; + + // post-sort attributes + public static final String URLLENGTH = "urllength"; + public static final String URLCOMPS = "urlcomps"; + public static final String DESCRLENGTH = "descrlength"; + public static final String DESCRCOMPS = "descrcomps"; + + // post-sort predicates + public static final String QUERYINURL = "queryinurl"; + public static final String QUERYINDESCR = "queryindescr"; + public static final String URLCOMPINTOPLIST = "urlcompintoplist"; + public static final String DESCRCOMPINTOPLIST = "descrcompintoplist"; + public String[] order; + private HashMap coeff; public plasmaSearchRankingProfile(String[] order) { this.order = order; + this.coeff = new HashMap(); + for (int i = 0; i < 3; i++) { + if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_QUALITY)) coeff.put(ENTROPY, new Integer((4 * (3 - i)))); + else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_DATE)) coeff.put(DATE, new Integer((4 * (3 - i)))); + else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_YBR)) coeff.put(YBR, new Integer((4 * (3 - i)))); + } + coeff.put(POSINTEXT, new Integer(11)); + coeff.put(WORDDISTANCE, new Integer(10)); + coeff.put(HITCOUNT, new Integer(9)); + coeff.put(DOMLENGTH, new Integer(8)); + coeff.put(URLLENGTH, new Integer(10)); + coeff.put(URLCOMPS, new Integer(10)); + coeff.put(DESCRLENGTH, new Integer(10)); + coeff.put(DESCRCOMPS, new Integer(10)); + coeff.put(QUERYINURL, new Integer(13)); + coeff.put(QUERYINDESCR, new Integer(14)); + coeff.put(URLCOMPINTOPLIST, new Integer(12)); + coeff.put(DESCRCOMPINTOPLIST, new Integer(11)); } public String orderString() { return order[0] + "-" + order[1] + "-" + order[2]; } - public long ranking(plasmaWordIndexEntry normalizedEntry) { + public long preRanking(plasmaWordIndexEntry normalizedEntry) { long ranking = 0; - for (int i = 0; i < 3; i++) { - if (this.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += normalizedEntry.getQuality() << (4 * (3 - i)); - else if (this.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += normalizedEntry.getVirtualAge() << (4 * (3 - i)); - else if (this.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << (4 * (3 - i)); + ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue(); + ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue(); + ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue(); + ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue(); + ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue(); + ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue(); + ranking += (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue(); + return ranking; + } + + public long postRanking( + plasmaWordIndexEntry normalizedEntry, + plasmaSearchQuery query, + Set topwords, + String[] urlcomps, + String[] descrcomps, + plasmaCrawlLURL.Entry page) { + + // apply pre-calculated order attributes + long ranking = this.preRanking(normalizedEntry); + + // apply 'common-sense' heuristic using references + for (int j = 0; j < urlcomps.length; j++) { + if (topwords.contains(urlcomps[j])) ranking += 1 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue(); + } + for (int j = 0; j < descrcomps.length; j++) { + if (topwords.contains(descrcomps[j])) ranking += 1 << ((Integer) coeff.get(DESCRCOMPINTOPLIST)).intValue(); + } + + // apply query-in-result matching + Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps); + Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps); + Iterator shi = query.queryHashes.iterator(); + String queryhash; + while (shi.hasNext()) { + queryhash = (String) shi.next(); + if (urlcomph.contains(queryhash)) ranking += 1 << ((Integer) coeff.get(QUERYINURL)).intValue(); + if (descrcomph.contains(queryhash)) ranking += 1 << ((Integer) coeff.get(QUERYINDESCR)).intValue(); } - ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << 11; - ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << 10; - ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << 9; - ranking += (255 - normalizedEntry.domlengthNormalized()) << 8; + + // prefer short urls + ranking += (255 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue(); + ranking += (24 - urlcomps.length) << ((Integer) coeff.get(URLCOMPS)).intValue(); + + // prefer long descriptions + ranking += (40 - Math.abs(40 - Math.min(40, page.descr().length()))) << ((Integer) coeff.get(DESCRLENGTH)).intValue(); + ranking += (8 - Math.abs(8 - Math.min(8, descrcomps.length))) << ((Integer) coeff.get(DESCRCOMPS)).intValue(); + return ranking; } + } diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 4fff9ea56..4f0c6acbd 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -139,47 +139,23 @@ public final class plasmaSearchResult { Object[] resultVector; plasmaWordIndexEntry indexEntry; plasmaCrawlLURL.Entry page; - String[] urlcomps; - String[] descrcomps; long ranking; - String queryhash; for (int i = 0; i < results.size(); i++) { // take out values from result array resultVector = (Object[]) results.get(i); indexEntry = (plasmaWordIndexEntry) resultVector[0]; - - // apply pre-calculated order attributes - ranking = this.ranking.ranking(indexEntry.generateNormalized(entryMin, entryMax)); - - // apply 'common-sense' heuristic using references - urlcomps = (String[]) resultVector[2]; - for (int j = 0; j < urlcomps.length; j++) { - if (commonSense.contains(urlcomps[j])) ranking += 1 << 12; - } - descrcomps = (String[]) resultVector[3]; - for (int j = 0; j < descrcomps.length; j++) { - if (commonSense.contains(descrcomps[j])) ranking += 1 << 11; - } - - // apply query-in-result matching - Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps); - Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps); - Iterator shi = query.queryHashes.iterator(); - while (shi.hasNext()) { - queryhash = (String) shi.next(); - if (urlcomph.contains(queryhash)) ranking += 1 << 13; - if (descrcomph.contains(queryhash)) ranking += 1 << 14; - } - - // prefer short urls page = (plasmaCrawlLURL.Entry) resultVector[1]; - ranking += (255 - page.url().toString().length()) << 10; - ranking += (24 - urlcomps.length) << 10; - - // prefer long descriptions - ranking += (40 - Math.abs(40 - Math.min(40, page.descr().length()))) << 10; - ranking += ( 8 - Math.abs( 8 - Math.min( 8, descrcomps.length))) << 10; + // calculate ranking + ranking = this.ranking.postRanking( + indexEntry, + query, + commonSense, + (String[]) resultVector[2], + (String[]) resultVector[3], + page + ); + // insert value //System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url()); pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), page);