added term-frequency ranking

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4413 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 1a296af6ff
commit 974fea7933

@ -152,6 +152,7 @@
<td>pos of phrase</td>
<td>pos in phrase</td>
<td>word distance</td>
<td>term frequency</td>
<td>authority</td>
<td>date</td>
<td>words in title</td>
@ -181,6 +182,7 @@
<td class="TableCellDark">#[phrase]#</td>
<td class="TableCellDark">#[posinphrase]#</td>
<td class="TableCellDark">#[worddistance]#</td>
<td>#[tf]#</td>
<td>#[authority]#</td>
<td>#[date]#</td>
<td>#[wordsintitle]#</td>

@ -67,7 +67,8 @@ public class Ranking_p {
rankingParameters.put(plasmaSearchRankingProfile.POSOFPHRASE, "Position Of Phrase");
rankingParameters.put(plasmaSearchRankingProfile.POSINPHRASE, "Position In Phrase");
rankingParameters.put(plasmaSearchRankingProfile.PREFER, "Application Of Prefer Pattern");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist");
rankingParameters.put(plasmaSearchRankingProfile.TERMFREQUENCY, "Term Frequency");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPS, "URL Components");
rankingParameters.put(plasmaSearchRankingProfile.URLLENGTH, "URL Length");
rankingParameters.put(plasmaSearchRankingProfile.WORDDISTANCE, "Word Distance");

@ -39,10 +39,10 @@ import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.yacy.yacyURL;
public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIEntry> implements kelondroOrder<indexRWIEntry> {
public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry> implements kelondroOrder<indexRWIVarEntry> {
private indexRWIVarEntry min, max;
private plasmaSearchRankingProfile ranking;
private kelondroMScoreCluster<String> doms;
private kelondroMScoreCluster<String> doms; // collected for "authority" heuristic
private int maxdomcount;
private static final int processors = Runtime.getRuntime().availableProcessors(); // for multiprocessor support, used during normalization
@ -55,7 +55,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIEntry> imp
this.maxdomcount = 0;
}
public void extend(indexContainer container) {
public void normalizeWith(indexContainer container) {
// normalize ranking: find minimum and maximum of separate ranking criteria
assert (container != null);
//long s0 = System.currentTimeMillis();
@ -102,7 +103,7 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIEntry> imp
if (this.doms.size() > 0) this.maxdomcount = this.doms.getMaxScore();
}
public kelondroOrder<indexRWIEntry> clone() {
public kelondroOrder<indexRWIVarEntry> clone() {
return null;
}
@ -111,14 +112,14 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIEntry> imp
}
public long cardinal(byte[] key) {
return cardinal(new indexRWIRowEntry(key));
return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key)));
}
public long tf(indexRWIEntry t) {
return (t.hitcount() - min.hitcount()) * (1 + max.wordsintext() - min.wordsintext()) / (1 + max.hitcount() - min.hitcount()) / (t.wordsintext() - min.wordsintext());
/**
 * Computes the term frequency of an index entry: the entry's hit count
 * divided by the sum of its words-in-text count, its words-in-title
 * count and one.
 *
 * @param t the index entry whose hitcount(), wordsintext() and
 *          wordsintitle() values are read
 * @return hitcount / (wordsintext + wordsintitle + 1) as a double
 */
public static final double termFrequency(indexRWIEntry t) {
// numerator: how often the term was hit in this entry
final double hits = (double) t.hitcount();
// denominator: total counted words; the added 1 keeps it non-zero when both counts are 0
final double words = (double) (t.wordsintext() + t.wordsintitle() + 1);
return hits / words;
}
public long cardinal(indexRWIEntry t) {
public long cardinal(indexRWIVarEntry t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
kelondroBitfield flags = t.flags();
@ -138,25 +139,26 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIEntry> imp
+ ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
+ ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ (((int)((((t.termFrequency()- min.termFrequency() )*256.0)/ (1 + max.termFrequency()- min.termFrequency()))))<< ranking.coeff_termfrequency)
+ ( authority(t.urlHash()) << ranking.coeff_authority)
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_appdescr : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_appdescr : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_appauthor : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_apptags : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_appref : 0))
+ (((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0))
+ (((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0))
+ (((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0));
+ (((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_apptags : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_appref : 0))
+ (((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0))
+ (((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0))
+ (((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0))
+ (((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0));
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
}
public int compare(indexRWIEntry a, indexRWIEntry b) {
public int compare(indexRWIVarEntry a, indexRWIVarEntry b) {
long ca = cardinal(a);
long cb = cardinal(b);
return (ca > cb) ? 1 : (ca < cb) ? -1 : 0;
@ -166,7 +168,7 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIEntry> imp
return "rx";
}
public boolean wellformed(indexRWIEntry a) {
public boolean wellformed(indexRWIVarEntry a) {
return true;
}

@ -39,8 +39,9 @@ public class indexRWIVarEntry implements indexRWIEntry {
posinphrase, posofphrase,
quality, urlcomps, urllength, virtualAge,
worddistance, wordsintext, wordsintitle;
public double termFrequency;
public indexRWIVarEntry(indexRWIRowEntry e) {
public indexRWIVarEntry(indexRWIEntry e) {
this.flags = e.flags();
this.freshUntil = e.freshUntil();
this.lastModified = e.lastModified();
@ -61,6 +62,7 @@ public class indexRWIVarEntry implements indexRWIEntry {
this.worddistance = e.worddistance();
this.wordsintext = e.wordsintext();
this.wordsintitle = e.wordsintitle();
this.termFrequency = ((double) e.hitcount()) / ((double) (e.wordsintext() + e.wordsintitle() + 1));
}
public void combineDistance(indexRWIEntry oe) {
@ -166,6 +168,10 @@ public class indexRWIVarEntry implements indexRWIEntry {
return wordsintitle;
}
/**
 * Accessor for the term-frequency value of this entry.
 *
 * @return the value currently held in the termFrequency field
 */
public double termFrequency() {
return this.termFrequency;
}
public static final void min(indexRWIVarEntry t, indexRWIEntry other) {
int v;
long w;
@ -185,6 +191,9 @@ public class indexRWIVarEntry implements indexRWIEntry {
if (t.urllength() > (v = other.urllength())) t.urllength = v;
if (t.urlcomps() > (v = other.urlcomps())) t.urlcomps = v;
if (t.wordsintitle() > (v = other.wordsintitle())) t.wordsintitle = v;
double tf = (other instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) other).termFrequency : indexRWIEntryOrder.termFrequency(other);
if (t.termFrequency > tf) t.termFrequency = tf;
}
public static final void max(indexRWIVarEntry t, indexRWIEntry other) {
@ -206,6 +215,8 @@ public class indexRWIVarEntry implements indexRWIEntry {
if (t.urllength() < (v = other.urllength())) t.urllength = v;
if (t.urlcomps() < (v = other.urlcomps())) t.urlcomps = v;
if (t.wordsintitle() < (v = other.wordsintitle())) t.wordsintitle = v;
double tf = (other instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) other).termFrequency : indexRWIEntryOrder.termFrequency(other);
if (t.termFrequency < tf) t.termFrequency = tf;
}
}

@ -33,6 +33,7 @@ import java.util.Iterator;
import de.anomic.data.listManager;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOrder;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -149,6 +150,7 @@ public class plasmaSearchAPI {
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", indexRWIEntryOrder.termFrequency(entry.word()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(entry.hash()));
prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.formatShortDay(new Date(entry.word().lastModified())));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());

@ -40,6 +40,7 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOrder;
import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexRWIVarEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.kelondro.kelondroMScoreCluster;
@ -113,81 +114,80 @@ public final class plasmaSearchRankingProcess {
}
if (sortorder == 2) {
insert(index, true);
insertRanked(index, true);
} else {
final Iterator<indexRWIRowEntry> en = index.entries();
// generate a new map where the urls are sorted (not by hash but by the url text)
indexRWIEntry ientry;
indexURLEntry uentry;
String u;
loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next();
insertNoOrder(index, fetchURLs);
}
}
private void insertNoOrder(indexContainer index, boolean local) {
final Iterator<indexRWIRowEntry> en = index.entries();
// generate a new map where the urls are sorted (not by hash but by the url text)
indexRWIEntry ientry;
indexURLEntry uentry;
String u;
loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next();
// check constraints
if (!testFlags(ientry)) continue loop;
// increase flag counts
for (int i = 0; i < 32; i++) {
if (ientry.flags().get(i)) {flagcount[i]++;}
}
// load url
if (sortorder == 0) {
this.sortedRWIEntries.put(ientry.urlHash(), ientry);
this.urlhashes.put(ientry.urlHash(), ientry.urlHash());
filteredCount++;
} else {
if (fetchURLs) {
uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) {
this.misses.add(ientry.urlHash());
} else {
u = uentry.comp().url().toNormalform(false, true);
this.sortedRWIEntries.put(u, ientry);
this.urlhashes.put(ientry.urlHash(), u);
filteredCount++;
}
// check constraints
if (!testFlags(ientry)) continue loop;
// increase flag counts
for (int i = 0; i < 32; i++) {
if (ientry.flags().get(i)) {flagcount[i]++;}
}
// load url
if (sortorder == 0) {
this.sortedRWIEntries.put(ientry.urlHash(), ientry);
this.urlhashes.put(ientry.urlHash(), ientry.urlHash());
filteredCount++;
} else {
if (local) {
uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) {
this.misses.add(ientry.urlHash());
} else {
u = uentry.comp().url().toNormalform(false, true);
this.sortedRWIEntries.put(u, ientry);
this.urlhashes.put(ientry.urlHash(), u);
filteredCount++;
}
} else {
filteredCount++;
}
// interrupt if we have enough
if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop;
} // end loop
}
}
// interrupt if we have enough
if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop;
} // end loop
}
public void insert(indexContainer container, boolean local) {
public void insertRanked(indexContainer index, boolean local) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
assert (container != null);
if (container.size() == 0) return;
assert (index != null);
if (index.size() == 0) return;
long timer = System.currentTimeMillis();
if (this.order == null) {
this.order = new indexRWIEntryOrder(query.ranking);
}
this.order.extend(container);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, container.size(), System.currentTimeMillis() - timer));
/*
container.setOrdering(o, 0);
container.sort();
*/
this.order.normalizeWith(index);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer));
// normalize entries and get ranking
timer = System.currentTimeMillis();
Iterator<indexRWIRowEntry> i = container.entries();
indexRWIEntry iEntry, l;
Iterator<indexRWIRowEntry> i = index.entries();
indexRWIVarEntry iEntry, l;
long biggestEntry = 0;
//long s0 = System.currentTimeMillis();
Long r;
while (i.hasNext()) {
iEntry = (indexRWIEntry) i.next();
if (iEntry.urlHash().length() != container.row().primaryKeyLength) continue;
iEntry = new indexRWIVarEntry(i.next());
if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
// increase flag counts
for (int j = 0; j < 32; j++) {
@ -216,11 +216,11 @@ public final class plasmaSearchRankingProcess {
continue;
} else {
if (urlhashes.containsKey(iEntry.urlHash())) continue;
l = (indexRWIEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
l = (indexRWIVarEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
urlhashes.remove(l.urlHash());
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
sortedRWIEntries.put(r, iEntry);
biggestEntry = order.cardinal((indexRWIEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey()));
biggestEntry = order.cardinal((indexRWIVarEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey()));
}
}
@ -232,7 +232,7 @@ public final class plasmaSearchRankingProcess {
//System.out.println("###DEBUG### time to sort " + container.size() + " entries to " + this.filteredCount + ": " + sc + " milliseconds, " + (container.size() / sc) + " entries/millisecond, ranking = " + tc);
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, container.size(), System.currentTimeMillis() - timer));
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer));
}
private boolean testFlags(indexRWIEntry ientry) {

@ -75,6 +75,7 @@ public class plasmaSearchRankingProfile {
public static final String CATHASAUDIO = "cathasaudio";
public static final String CATHASVIDEO = "cathasvideo";
public static final String CATHASAPP = "cathasapp";
public static final String TERMFREQUENCY = "tf";
// post-sort predicates
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
@ -91,7 +92,8 @@ public class plasmaSearchRankingProfile {
coeff_posintext, coeff_posofphrase, coeff_posinphrase, coeff_authority, coeff_worddistance,
coeff_appurl, coeff_appdescr, coeff_appauthor, coeff_apptags, coeff_appref, coeff_appemph,
coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp,
coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer;
coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer,
coeff_termfrequency;
public plasmaSearchRankingProfile(int mediatype) {
// set default-values
@ -103,28 +105,29 @@ public class plasmaSearchRankingProfile {
coeff_phrasesintext = 3;
coeff_llocal = 2;
coeff_lother = 3;
coeff_urllength = 15;
coeff_urlcomps = 15;
coeff_urllength = 13;
coeff_urlcomps = 13;
coeff_hitcount = 4;
coeff_posintext = 11;
coeff_posintext = 10;
coeff_posofphrase = 9;
coeff_posinphrase = 1;
coeff_authority = 13;
coeff_worddistance = 15;
coeff_appurl = 14;
coeff_appdescr = 12;
coeff_appauthor = 13;
coeff_authority = 12;
coeff_worddistance = 13;
coeff_appurl = 13;
coeff_appdescr = 11;
coeff_appauthor = 12;
coeff_apptags = 8;
coeff_appref = 8;
coeff_appemph = 12;
coeff_urlcompintoplist = 3;
coeff_descrcompintoplist = 2;
coeff_prefer = 15;
coeff_appemph = 11;
coeff_catindexof = (mediatype == plasmaSearchQuery.CONTENTDOM_TEXT) ? 0 : 15;
coeff_cathasimage = (mediatype == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 15 : 0;
coeff_cathasaudio = (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO) ? 15 : 0;
coeff_cathasvideo = (mediatype == plasmaSearchQuery.CONTENTDOM_VIDEO) ? 15 : 0;
coeff_cathasapp = (mediatype == plasmaSearchQuery.CONTENTDOM_APP) ? 15 : 0;
coeff_termfrequency = 14;
coeff_urlcompintoplist = 3;
coeff_descrcompintoplist = 2;
coeff_prefer = 13;
}
public plasmaSearchRankingProfile(String prefix, String profile) {
@ -172,6 +175,7 @@ public class plasmaSearchRankingProfile {
coeff_cathasaudio = parseMap(coeff, CATHASAUDIO, coeff_cathasaudio);
coeff_cathasvideo = parseMap(coeff, CATHASVIDEO, coeff_cathasvideo);
coeff_cathasapp = parseMap(coeff, CATHASAPP, coeff_cathasapp);
coeff_termfrequency = parseMap(coeff, TERMFREQUENCY, coeff_termfrequency);
coeff_urlcompintoplist = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist);
coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist);
coeff_prefer = parseMap(coeff, PREFER, coeff_prefer);
@ -227,6 +231,7 @@ public class plasmaSearchRankingProfile {
ext.put(prefix + CATHASAUDIO, Integer.toString(coeff_cathasaudio));
ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo));
ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp));
ext.put(prefix + TERMFREQUENCY, Integer.toString(coeff_termfrequency));
return ext;
}

@ -557,7 +557,7 @@ public final class yacyClient {
// store remote result to local result container
synchronized (containerCache) {
// insert one container into the search result buffer
containerCache.insert(container[0], false); // one is enough
containerCache.insertRanked(container[0], false); // one is enough
// integrate remote topwords
String references = (String) result.get("references");

Loading…
Cancel
Save