Restructured how ranking is applied:

all ranking attributes are now concentrated in
plasmaSearchRankingProfile

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1541 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 5f5eee1ae9
commit fb7411d7bb

@ -161,16 +161,18 @@ public class index {
(yacyCore.seedDB.mySeed != null) && (yacyCore.seedDB.mySeed != null) &&
(yacyCore.seedDB.mySeed.getAddress() != null)); (yacyCore.seedDB.mySeed.getAddress() != null));
String order1="", order2="", order3=""; String order1=plasmaSearchRankingProfile.ORDER_DATE;
if (order.startsWith("YBR")) order1 = plasmaSearchQuery.ORDER_YBR; String order2=plasmaSearchRankingProfile.ORDER_YBR;
if (order.startsWith("Date")) order1 = plasmaSearchQuery.ORDER_DATE; String order3=plasmaSearchRankingProfile.ORDER_QUALITY;
if (order.startsWith("Quality")) order1 = plasmaSearchQuery.ORDER_QUALITY; if (order.startsWith("YBR")) order1 = plasmaSearchRankingProfile.ORDER_YBR;
if (order.indexOf("-YBR-") > 0) order2 = plasmaSearchQuery.ORDER_YBR; if (order.startsWith("Date")) order1 = plasmaSearchRankingProfile.ORDER_DATE;
if (order.indexOf("-Date-") > 0) order2 = plasmaSearchQuery.ORDER_DATE; if (order.startsWith("Quality")) order1 = plasmaSearchRankingProfile.ORDER_QUALITY;
if (order.indexOf("-Quality-") > 0) order2 = plasmaSearchQuery.ORDER_QUALITY; if (order.indexOf("-YBR-") > 0) order2 = plasmaSearchRankingProfile.ORDER_YBR;
if (order.endsWith("YBR")) order3 = plasmaSearchQuery.ORDER_YBR; if (order.indexOf("-Date-") > 0) order2 = plasmaSearchRankingProfile.ORDER_DATE;
if (order.endsWith("Date")) order3 = plasmaSearchQuery.ORDER_DATE; if (order.indexOf("-Quality-") > 0) order2 = plasmaSearchRankingProfile.ORDER_QUALITY;
if (order.endsWith("Quality")) order3 = plasmaSearchQuery.ORDER_QUALITY; if (order.endsWith("YBR")) order3 = plasmaSearchRankingProfile.ORDER_YBR;
if (order.endsWith("Date")) order3 = plasmaSearchRankingProfile.ORDER_DATE;
if (order.endsWith("Quality")) order3 = plasmaSearchRankingProfile.ORDER_QUALITY;
String urlmask = ""; String urlmask = "";
if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) { if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) {
urlmask = ".*"; urlmask = ".*";

@ -112,7 +112,7 @@ public final class search {
yacyCore.log.logInfo("INIT HASH SEARCH: " + squery.queryHashes + " - " + squery.wantedResults + " links"); yacyCore.log.logInfo("INIT HASH SEARCH: " + squery.queryHashes + " - " + squery.wantedResults + " links");
long timestamp1 = System.currentTimeMillis(); long timestamp1 = System.currentTimeMillis();
plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY}); plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchRankingProfile.ORDER_YBR, plasmaSearchRankingProfile.ORDER_DATE, plasmaSearchRankingProfile.ORDER_QUALITY});
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults); plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
plasmaSearchTimingProfile remoteTiming = null; plasmaSearchTimingProfile remoteTiming = null;
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache); plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);

@ -140,7 +140,7 @@ public final class plasmaSearchPreOrder {
i = container.entries(); i = container.entries();
for (int j = 0; j < count; j++) { for (int j = 0; j < count; j++) {
indexEntry = (plasmaWordIndexEntry) i.next(); indexEntry = (plasmaWordIndexEntry) i.next();
pageAcc.put(serverCodings.encodeHex(this.ranking.ranking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry); pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry);
} }
} }

@ -52,10 +52,6 @@ import de.anomic.server.serverByteBuffer;
public final class plasmaSearchQuery { public final class plasmaSearchQuery {
public static final String ORDER_QUALITY = "Quality";
public static final String ORDER_DATE = "Date";
public static final String ORDER_YBR = "YBR";
public static final int SEARCHDOM_LOCAL = 0; public static final int SEARCHDOM_LOCAL = 0;
public static final int SEARCHDOM_GROUPDHT = 1; public static final int SEARCHDOM_GROUPDHT = 1;
public static final int SEARCHDOM_GROUPALL = 2; public static final int SEARCHDOM_GROUPALL = 2;

@ -41,30 +41,119 @@
package de.anomic.plasma; package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
public class plasmaSearchRankingProfile { public class plasmaSearchRankingProfile {
// old parameters for ordering
public static final String ORDER_QUALITY = "Quality";
public static final String ORDER_DATE = "Date";
public static final String ORDER_YBR = "YBR";
// pre-sort attributes
public static final String ENTROPY = "entropy";
public static final String DATE = "date";
public static final String YBR = "ybr";
public static final String POSINTEXT = "posintext";
public static final String WORDDISTANCE = "worddistance";
public static final String HITCOUNT = "hitcount";
public static final String DOMLENGTH = "domlength";
// post-sort attributes
public static final String URLLENGTH = "urllength";
public static final String URLCOMPS = "urlcomps";
public static final String DESCRLENGTH = "descrlength";
public static final String DESCRCOMPS = "descrcomps";
// post-sort predicates
public static final String QUERYINURL = "queryinurl";
public static final String QUERYINDESCR = "queryindescr";
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
public static final String DESCRCOMPINTOPLIST = "descrcompintoplist";
public String[] order; public String[] order;
private HashMap coeff;
public plasmaSearchRankingProfile(String[] order) { public plasmaSearchRankingProfile(String[] order) {
this.order = order; this.order = order;
this.coeff = new HashMap();
for (int i = 0; i < 3; i++) {
if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_QUALITY)) coeff.put(ENTROPY, new Integer((4 * (3 - i))));
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_DATE)) coeff.put(DATE, new Integer((4 * (3 - i))));
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_YBR)) coeff.put(YBR, new Integer((4 * (3 - i))));
}
coeff.put(POSINTEXT, new Integer(11));
coeff.put(WORDDISTANCE, new Integer(10));
coeff.put(HITCOUNT, new Integer(9));
coeff.put(DOMLENGTH, new Integer(8));
coeff.put(URLLENGTH, new Integer(10));
coeff.put(URLCOMPS, new Integer(10));
coeff.put(DESCRLENGTH, new Integer(10));
coeff.put(DESCRCOMPS, new Integer(10));
coeff.put(QUERYINURL, new Integer(13));
coeff.put(QUERYINDESCR, new Integer(14));
coeff.put(URLCOMPINTOPLIST, new Integer(12));
coeff.put(DESCRCOMPINTOPLIST, new Integer(11));
} }
public String orderString() { public String orderString() {
return order[0] + "-" + order[1] + "-" + order[2]; return order[0] + "-" + order[1] + "-" + order[2];
} }
public long ranking(plasmaWordIndexEntry normalizedEntry) { public long preRanking(plasmaWordIndexEntry normalizedEntry) {
long ranking = 0; long ranking = 0;
for (int i = 0; i < 3; i++) { ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue();
if (this.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += normalizedEntry.getQuality() << (4 * (3 - i)); ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue();
else if (this.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += normalizedEntry.getVirtualAge() << (4 * (3 - i)); ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue();
else if (this.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << (4 * (3 - i)); ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue();
return ranking;
}
public long postRanking(
plasmaWordIndexEntry normalizedEntry,
plasmaSearchQuery query,
Set topwords,
String[] urlcomps,
String[] descrcomps,
plasmaCrawlLURL.Entry page) {
// apply pre-calculated order attributes
long ranking = this.preRanking(normalizedEntry);
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) ranking += 1 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue();
}
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j])) ranking += 1 << ((Integer) coeff.get(DESCRCOMPINTOPLIST)).intValue();
}
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps);
Iterator shi = query.queryHashes.iterator();
String queryhash;
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) ranking += 1 << ((Integer) coeff.get(QUERYINURL)).intValue();
if (descrcomph.contains(queryhash)) ranking += 1 << ((Integer) coeff.get(QUERYINDESCR)).intValue();
} }
ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << 11;
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << 10; // prefer short urls
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << 9; ranking += (255 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (255 - normalizedEntry.domlengthNormalized()) << 8; ranking += (24 - urlcomps.length) << ((Integer) coeff.get(URLCOMPS)).intValue();
// prefer long descriptions
ranking += (40 - Math.abs(40 - Math.min(40, page.descr().length()))) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (8 - Math.abs(8 - Math.min(8, descrcomps.length))) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking; return ranking;
} }
} }

@ -139,47 +139,23 @@ public final class plasmaSearchResult {
Object[] resultVector; Object[] resultVector;
plasmaWordIndexEntry indexEntry; plasmaWordIndexEntry indexEntry;
plasmaCrawlLURL.Entry page; plasmaCrawlLURL.Entry page;
String[] urlcomps;
String[] descrcomps;
long ranking; long ranking;
String queryhash;
for (int i = 0; i < results.size(); i++) { for (int i = 0; i < results.size(); i++) {
// take out values from result array // take out values from result array
resultVector = (Object[]) results.get(i); resultVector = (Object[]) results.get(i);
indexEntry = (plasmaWordIndexEntry) resultVector[0]; indexEntry = (plasmaWordIndexEntry) resultVector[0];
// apply pre-calculated order attributes
ranking = this.ranking.ranking(indexEntry.generateNormalized(entryMin, entryMax));
// apply 'common-sense' heuristic using references
urlcomps = (String[]) resultVector[2];
for (int j = 0; j < urlcomps.length; j++) {
if (commonSense.contains(urlcomps[j])) ranking += 1 << 12;
}
descrcomps = (String[]) resultVector[3];
for (int j = 0; j < descrcomps.length; j++) {
if (commonSense.contains(descrcomps[j])) ranking += 1 << 11;
}
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps);
Iterator shi = query.queryHashes.iterator();
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) ranking += 1 << 13;
if (descrcomph.contains(queryhash)) ranking += 1 << 14;
}
// prefer short urls
page = (plasmaCrawlLURL.Entry) resultVector[1]; page = (plasmaCrawlLURL.Entry) resultVector[1];
ranking += (255 - page.url().toString().length()) << 10;
ranking += (24 - urlcomps.length) << 10;
// prefer long descriptions
ranking += (40 - Math.abs(40 - Math.min(40, page.descr().length()))) << 10;
ranking += ( 8 - Math.abs( 8 - Math.min( 8, descrcomps.length))) << 10;
// calculate ranking
ranking = this.ranking.postRanking(
indexEntry,
query,
commonSense,
(String[]) resultVector[2],
(String[]) resultVector[3],
page
);
// insert value // insert value
//System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url()); //System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), page); pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), page);

Loading…
Cancel
Save