From 9e23acf2d6d403c32ac5a712aa30359a125339c6 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 11 Dec 2007 01:32:58 +0000 Subject: [PATCH] introduced new 'authority' ranking property git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4265 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.html | 4 +- htroot/IndexControlRWIs_p.java | 1 + htroot/Ranking_p.java | 1 + .../de/anomic/index/indexRWIEntryOrder.java | 67 ++++++++++++++++--- .../plasma/plasmaSearchRankingProfile.java | 38 ++++++----- 5 files changed, 85 insertions(+), 26 deletions(-) diff --git a/htroot/IndexControlRWIs_p.html b/htroot/IndexControlRWIs_p.html index 25149cefc..ee2ca7f5c 100644 --- a/htroot/IndexControlRWIs_p.html +++ b/htroot/IndexControlRWIs_p.html @@ -137,7 +137,7 @@ Resource Negative Ranking Factors - Positive Ranking Factors + Positive Ranking Factors Reverse Normalized Weighted Ranking Sum @@ -152,6 +152,7 @@ pos of phrase pos in phrase word distance + authority date words in title words in text @@ -180,6 +181,7 @@ #[phrase]# #[posinphrase]# #[worddistance]# + #[authority]# #[date]# #[wordsintitle]# #[wordsintext]# diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index a4d3c0df6..57a259eeb 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -419,6 +419,7 @@ public class IndexControlRWIs_p { prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn)); prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash())); prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash())); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", ranked.getOrder().authority(entry.hash())); prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.shortDayTime(new Date(entry.word().lastModified()))); prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle()); prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext()); diff --git a/htroot/Ranking_p.java b/htroot/Ranking_p.java index 69b6b83b3..f2ce8dabb 100644 --- a/htroot/Ranking_p.java +++ b/htroot/Ranking_p.java @@ -50,6 +50,7 @@ public class Ranking_p { rankingParameters.put(plasmaSearchRankingProfile.APPREF, "Appearance In Reference"); rankingParameters.put(plasmaSearchRankingProfile.APPTAGS, "Appearance In Tags"); rankingParameters.put(plasmaSearchRankingProfile.APPURL, "Appearance In URL"); + rankingParameters.put(plasmaSearchRankingProfile.AUTHORITY, "Authority of Domain"); rankingParameters.put(plasmaSearchRankingProfile.CATHASAPP, "Category App, Appearance"); rankingParameters.put(plasmaSearchRankingProfile.CATHASAUDIO, "Category Audio Appearance"); rankingParameters.put(plasmaSearchRankingProfile.CATHASIMAGE, "Category Image Appearance"); diff --git a/source/de/anomic/index/indexRWIEntryOrder.java b/source/de/anomic/index/indexRWIEntryOrder.java index 3325e0845..ceffcf1bb 100644 --- a/source/de/anomic/index/indexRWIEntryOrder.java +++ b/source/de/anomic/index/indexRWIEntryOrder.java @@ -26,8 +26,13 @@ package de.anomic.index; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + import de.anomic.kelondro.kelondroAbstractOrder; import de.anomic.kelondro.kelondroBitfield; +import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroOrder; import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaSearchRankingProcess; @@ -37,6 +42,8 @@ import de.anomic.yacy.yacyURL; public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondroOrder { private indexRWIVarEntry min, max; private plasmaSearchRankingProfile ranking; + private kelondroMScoreCluster doms; + private int maxdomcount; private static final int processors = Runtime.getRuntime().availableProcessors(); // for multiprocessor support, used during normalization @@ -44,6 +51,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr this.min = null; this.max = null; this.ranking = profile; + this.doms = new kelondroMScoreCluster(); + this.maxdomcount = 0; } public void extend(indexContainer container) { @@ -59,9 +68,20 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr mmf1.run(); // execute other fork in this thread if (this.min == null) this.min = mmf1.entryMin; else indexRWIVarEntry.min(this.min, mmf1.entryMin); if (this.max == null) this.max = mmf1.entryMax; else indexRWIVarEntry.max(this.max, mmf1.entryMax); + Map.Entry entry; + Iterator di = mmf1.domcount().entrySet().iterator(); + while (di.hasNext()) { + entry = (Map.Entry) di.next(); + this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue()); + } try {mmf0.join();} catch (InterruptedException e) {} // wait for fork thread to finish if (this.min == null) this.min = mmf0.entryMin; else indexRWIVarEntry.min(this.min, mmf0.entryMin); if (this.max == null) this.max = mmf0.entryMax; else indexRWIVarEntry.max(this.max, mmf0.entryMax); + di = mmf0.domcount().entrySet().iterator(); + while (di.hasNext()) { + entry = (Map.Entry) di.next(); + this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue()); + } //long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0); //System.out.println("***DEBUG*** indexRWIEntry.Order (2-THREADED): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond"); } else if (container.size() > 0) { @@ -70,15 +90,26 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr mmf.run(); // execute without multi-threading if (this.min == null) this.min = mmf.entryMin; else indexRWIVarEntry.min(this.min, mmf.entryMin); if (this.max == null) this.max = mmf.entryMax; else indexRWIVarEntry.max(this.max, mmf.entryMax); + Map.Entry entry; + Iterator di = mmf.domcount().entrySet().iterator(); + while (di.hasNext()) { + entry = (Map.Entry) di.next(); + this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue()); + } //long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0); //System.out.println("***DEBUG*** indexRWIEntry.Order (ONETHREAD): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond"); } + if (this.doms.size() > 0) this.maxdomcount = this.doms.getMaxScore(); } public Object clone() { return null; } + public int authority(String urlHash) { + return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount); + } + public long cardinal(byte[] key) { return cardinal(new indexRWIRowEntry(key)); } @@ -95,14 +126,15 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr + ((t.posintext() == 0) ? 0 : ((256 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext)) + ((t.posofphrase() == 0) ? 0 : ((256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)) + ((t.posinphrase() == 0) ? 0 : ((256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)) - + ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance) - + ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date) - + ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle) - + ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext) - + ( (((t.phrasesintext()- min.phrasesintext()) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext) - + ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal) - + ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother) - + ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + + ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance) + + ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date) + + ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle) + + ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext) + + ( (((t.phrasesintext()- min.phrasesintext() ) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext) + + ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal) + + ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother) + + ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + + ( authority(t.urlHash()) << ranking.coeff_authority) + (((flags.get(indexRWIEntry.flag_app_url)) ? 255 << ranking.coeff_appurl : 0)) + (((flags.get(indexRWIEntry.flag_app_descr)) ? 255 << ranking.coeff_appdescr : 0)) + (((flags.get(indexRWIEntry.flag_app_author)) ? 255 << ranking.coeff_appauthor : 0)) @@ -157,11 +189,15 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr private indexRWIVarEntry entryMin, entryMax; private indexContainer container; private int start, end; + private HashMap doms; + private Integer int1; public minmaxfinder(indexContainer container, int start /*including*/, int end /*excluding*/) { this.container = container; this.start = start; this.end = end; + this.doms = new HashMap(); + this.int1 = new Integer(1); } public void run() { @@ -170,12 +206,27 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr this.entryMax = null; indexRWIRowEntry iEntry; int p = this.start; + String dom; + Integer count; while (p < this.end) { iEntry = new indexRWIRowEntry(container.get(p++)); + // find min/max if (this.entryMin == null) this.entryMin = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.min(this.entryMin, iEntry); if (this.entryMax == null) this.entryMax = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.max(this.entryMax, iEntry); + // update domcount + dom = iEntry.urlHash().substring(6); + count = (Integer) doms.get(dom); + if (count == null) { + doms.put(dom, int1); + } else { + doms.put(dom, new Integer(count.intValue() + 1)); + } } } + + public HashMap domcount() { + return this.doms; + } } } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 1af4a36c3..75f808999 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -62,6 +62,7 @@ public class plasmaSearchRankingProfile { public static final String POSINTEXT = "posintext"; public static final String POSOFPHRASE = "posofphrase"; public static final String POSINPHRASE = "posinphrase"; + public static final String AUTHORITY = "authority"; public static final String WORDDISTANCE = "worddistance"; public static final String APPURL = "appurl"; public static final String APPDESCR = "appdescr"; @@ -83,7 +84,7 @@ public class plasmaSearchRankingProfile { public int coeff_domlength, coeff_ybr, coeff_date, coeff_wordsintitle, coeff_wordsintext, coeff_phrasesintext, coeff_llocal, coeff_lother, coeff_urllength, coeff_urlcomps, coeff_hitcount, - coeff_posintext, coeff_posofphrase, coeff_posinphrase, coeff_worddistance, + coeff_posintext, coeff_posofphrase, coeff_posinphrase, coeff_authority, coeff_worddistance, coeff_appurl, coeff_appdescr, coeff_appauthor, coeff_apptags, coeff_appref, coeff_appemph, coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp, coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer; @@ -91,34 +92,35 @@ public class plasmaSearchRankingProfile { public plasmaSearchRankingProfile(int mediatype) { // set default-values coeff_domlength = 8; - coeff_ybr = 8; + coeff_ybr = 9; coeff_date = 4; coeff_wordsintitle = 4; - coeff_wordsintext = 1; - coeff_phrasesintext = 1; + coeff_wordsintext = 2; + coeff_phrasesintext = 3; coeff_llocal = 2; coeff_lother = 3; - coeff_urllength = 14; - coeff_urlcomps = 14; - coeff_hitcount = 5; - coeff_posintext = 7; - coeff_posofphrase = 6; + coeff_urllength = 15; + coeff_urlcomps = 15; + coeff_hitcount = 4; + coeff_posintext = 11; + coeff_posofphrase = 9; coeff_posinphrase = 1; + coeff_authority = 13; coeff_worddistance = 15; coeff_appurl = 14; - coeff_appdescr = 13; + coeff_appdescr = 12; coeff_appauthor = 13; coeff_apptags = 8; - coeff_appref = 9; - coeff_appemph = 13; + coeff_appref = 8; + coeff_appemph = 12; coeff_urlcompintoplist = 3; coeff_descrcompintoplist = 2; coeff_prefer = 15; - coeff_catindexof = (mediatype == plasmaSearchQuery.CONTENTDOM_TEXT) ? 1 : 10; - coeff_cathasimage = (mediatype == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 15 : 1; - coeff_cathasaudio = (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO) ? 15 : 1; - coeff_cathasvideo = (mediatype == plasmaSearchQuery.CONTENTDOM_VIDEO) ? 15 : 1; - coeff_cathasapp = (mediatype == plasmaSearchQuery.CONTENTDOM_APP) ? 15 : 1; + coeff_catindexof = (mediatype == plasmaSearchQuery.CONTENTDOM_TEXT) ? 0 : 15; + coeff_cathasimage = (mediatype == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 15 : 0; + coeff_cathasaudio = (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO) ? 15 : 0; + coeff_cathasvideo = (mediatype == plasmaSearchQuery.CONTENTDOM_VIDEO) ? 15 : 0; + coeff_cathasapp = (mediatype == plasmaSearchQuery.CONTENTDOM_APP) ? 15 : 0; } public plasmaSearchRankingProfile(String prefix, String profile) { @@ -153,6 +155,7 @@ public class plasmaSearchRankingProfile { coeff_posintext = parseMap(coeff, POSINTEXT, coeff_posintext); coeff_posofphrase = parseMap(coeff, POSOFPHRASE, coeff_posofphrase); coeff_posinphrase = parseMap(coeff, POSINPHRASE, coeff_posinphrase); + coeff_authority = parseMap(coeff, AUTHORITY, coeff_authority); coeff_worddistance = parseMap(coeff, WORDDISTANCE, coeff_worddistance); coeff_appurl = parseMap(coeff, APPURL, coeff_appurl); coeff_appdescr = parseMap(coeff, APPDESCR, coeff_appdescr); @@ -207,6 +210,7 @@ public class plasmaSearchRankingProfile { ext.put(prefix + POSINTEXT, Integer.toString(coeff_posintext)); ext.put(prefix + POSOFPHRASE, Integer.toString(coeff_posofphrase)); ext.put(prefix + POSINPHRASE, Integer.toString(coeff_posinphrase)); + ext.put(prefix + AUTHORITY, Integer.toString(coeff_authority)); ext.put(prefix + WORDDISTANCE, Integer.toString(coeff_worddistance)); ext.put(prefix + APPURL, Integer.toString(coeff_appurl)); ext.put(prefix + APPDESCR, Integer.toString(coeff_appdescr));