added a possibility to get the ranking values for a URL.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1703 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
allo 19 years ago
parent 85ac7d8386
commit 4b0dae8fcf

@@ -4,6 +4,8 @@ version 0.44svn
* FIXED: special chars(like german umlauts) in tagNames allowed. (Allo)
* ADDED: Show public Bookmarks in Bookmarks.html, private ones, if the user is logged in. (Allo)
* FIXED: /xml/bookmarks/* now uses one file for private/public entries. private only with password.
* ADDED: possibility to get the ranking for a url. (Allo)
version 0.43
* UPDATED: new database handling of index entry objects, less IO overhead (Orbiter)
* ADDED: many new ranking attributes and handling routines (Orbiter)

@@ -159,19 +159,87 @@ public class plasmaSearchRankingProfile {
return new String(ext);
}
public HashMap getPreRanking(plasmaWordIndexEntry normalizedEntry){
HashMap map=new HashMap();
map.put(ENTROPY, new Integer(normalizedEntry.getQuality()));
map.put(DATE, new Integer(normalizedEntry.getVirtualAge()));
map.put(YBR, new Integer(plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash())));
map.put(POSINTEXT, new Integer((normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext())));
map.put(WORDDISTANCE, new Integer((normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance())));
map.put(HITCOUNT, new Integer((normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount()));
map.put(DOMLENGTH, new Integer((255 - normalizedEntry.domlengthNormalized())));
return map;
}
public long preRanking(plasmaWordIndexEntry normalizedEntry) {
long ranking = 0;
ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue();
ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue();
HashMap map=getPreRanking(normalizedEntry);
ranking += ((Integer)map.get(ENTROPY)).intValue() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += ((Integer)map.get(DATE)).intValue() << ((Integer) coeff.get(DATE)).intValue();
ranking += ((Integer)map.get(YBR)).intValue() << ((Integer) coeff.get(YBR)).intValue();
ranking += ((Integer)map.get(POSINTEXT)).intValue() << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += ((Integer)map.get(WORDDISTANCE)).intValue() << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += ((Integer)map.get(HITCOUNT)).intValue() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += ((Integer)map.get(DOMLENGTH)).intValue() << ((Integer) coeff.get(DOMLENGTH)).intValue();
return ranking;
}
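For context: in preRanking each coefficient from coeff is applied as a bit shift, so an attribute's weight is effectively 2^coeff. A minimal, stand-alone sketch of this combination step (not YaCy code; the attribute names, values and coefficients below are invented):

import java.util.HashMap;
import java.util.Iterator;

// Stand-alone sketch (not YaCy code): the combination step used in preRanking.
// Each coefficient is applied as a bit shift, i.e. a weight of 2^coeff.
public class RankingShiftSketch {
    static long combine(HashMap values, HashMap coeff) {
        long ranking = 0;
        Iterator it = values.keySet().iterator();
        while (it.hasNext()) {
            String key = (String) it.next();
            // value << coeff  ==  value * 2^coeff
            ranking += ((long) ((Integer) values.get(key)).intValue()) << ((Integer) coeff.get(key)).intValue();
        }
        return ranking;
    }

    public static void main(String[] args) {
        HashMap values = new HashMap();     // invented normalized attribute values
        values.put("ybr", new Integer(200));
        values.put("domlength", new Integer(180));
        HashMap coeff = new HashMap();      // invented profile coefficients
        coeff.put("ybr", new Integer(15));
        coeff.put("domlength", new Integer(10));
        System.out.println(combine(values, coeff));  // 200*2^15 + 180*2^10 = 6737920
    }
}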
public HashMap getPostRanking(plasmaWordIndexEntry normalizedEntry,
plasmaSearchQuery query,
Set topwords,
String[] urlcomps,
String[] descrcomps,
plasmaCrawlLURL.Entry page){
HashMap map=new HashMap();
HashMap tmp, tmp2;
//apply 'common-sense' heuristic using references
tmp=new HashMap();
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j]))
tmp.put(urlcomps[j], new Integer(256));
else
tmp.put(urlcomps[j], new Integer(0));
}
map.put(URLCOMPINTOPLIST, tmp);
tmp=new HashMap();
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j]))
tmp.put(descrcomps[j], new Integer(256));
else
tmp.put(descrcomps[j], new Integer(0));
}
map.put(DESCRCOMPINTOPLIST, tmp);
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps);
Iterator shi = query.queryHashes.iterator();
String queryhash;
tmp=new HashMap();
tmp2=new HashMap();
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash))
tmp.put(queryhash, new Integer(256));
else
tmp.put(queryhash, new Integer(0));
if (descrcomph.contains(queryhash))
tmp2.put(queryhash, new Integer(256));
else
tmp2.put(queryhash, new Integer(0));
}
map.put(QUERYINURL, tmp);
map.put(QUERYINDESCR, tmp2);
// prefer short urls
map.put(URLLENGTH, new Integer((256 - page.url().toString().length())));
map.put(URLCOMPS, new Integer((32 - urlcomps.length)));
// prefer long descriptions
map.put(DESCRLENGTH, new Integer((255 * page.descr().length() / 80)));
map.put(DESCRCOMPS, new Integer((255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12)));
return map;
}
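Note that the map returned by getPostRanking mixes two kinds of values: plain Integers for scalar attributes (URLLENGTH, URLCOMPS, DESCRLENGTH, DESCRCOMPS) and nested HashMaps holding one Integer per word or query hash (URLCOMPINTOPLIST, DESCRCOMPINTOPLIST, QUERYINURL, QUERYINDESCR). A consumer therefore has to check the value type before using it; a small stand-alone sketch (keys and sample values invented):

import java.util.HashMap;
import java.util.Iterator;

// Sketch only (not part of the commit): walking the mixed map produced by
// getPostRanking, distinguishing scalar attributes from per-word sub-maps.
public class PostRankingMapSketch {
    static void dump(HashMap map) {
        Iterator it = map.keySet().iterator();
        while (it.hasNext()) {
            String key = (String) it.next();
            Object value = map.get(key);
            if (value instanceof HashMap) {
                // per-word attribute, e.g. URLCOMPINTOPLIST or QUERYINURL
                HashMap words = (HashMap) value;
                Iterator wi = words.keySet().iterator();
                while (wi.hasNext()) {
                    Object word = wi.next();
                    System.out.println(key + "/" + word + " = " + words.get(word));
                }
            } else {
                // scalar attribute, e.g. URLLENGTH or DESCRLENGTH
                System.out.println(key + " = " + value);
            }
        }
    }

    public static void main(String[] args) {
        HashMap sample = new HashMap();
        sample.put("urllength", new Integer(230));      // invented scalar value
        HashMap perWord = new HashMap();
        perWord.put("wiki", new Integer(256));          // invented per-word value
        sample.put("urlcompintoplist", perWord);
        dump(sample);
    }
}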
public long postRanking(
plasmaWordIndexEntry normalizedEntry,
plasmaSearchQuery query,
@@ -182,33 +250,41 @@ public class plasmaSearchRankingProfile {
// apply pre-calculated order attributes
long ranking = this.preRanking(normalizedEntry);
HashMap map=getPostRanking(normalizedEntry, query, topwords, urlcomps, descrcomps, page);
Iterator it;
HashMap tmp;
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) ranking += 256 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue();
tmp=(HashMap) map.get(URLCOMPINTOPLIST);
it=tmp.keySet().iterator();
while(it.hasNext()){
ranking+= ((Integer)tmp.get((String)it.next())).intValue() << ((Integer)coeff.get(URLCOMPINTOPLIST)).intValue();
}
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j])) ranking += 256 << ((Integer) coeff.get(DESCRCOMPINTOPLIST)).intValue();
tmp=(HashMap) map.get(DESCRCOMPINTOPLIST);
it=tmp.keySet().iterator();
while(it.hasNext()){
ranking+= ((Integer)tmp.get((String)it.next())).intValue() << ((Integer)coeff.get(DESCRCOMPINTOPLIST)).intValue();
}
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps);
Iterator shi = query.queryHashes.iterator();
String queryhash;
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) ranking += 256 << ((Integer) coeff.get(QUERYINURL)).intValue();
if (descrcomph.contains(queryhash)) ranking += 256 << ((Integer) coeff.get(QUERYINDESCR)).intValue();
tmp=(HashMap) map.get(QUERYINURL);
it=tmp.keySet().iterator();
while(it.hasNext()){
ranking+= ((Integer)tmp.get((String)it.next())).intValue() << ((Integer)coeff.get(QUERYINURL)).intValue();
}
tmp=(HashMap) map.get(QUERYINDESCR);
it=tmp.keySet().iterator();
while(it.hasNext()){
ranking+= ((Integer)tmp.get((String)it.next())).intValue() << ((Integer)coeff.get(QUERYINDESCR)).intValue();
}
// prefer short urls
ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (32 - urlcomps.length) << ((Integer) coeff.get(URLCOMPS)).intValue();
ranking += ((Integer)map.get(URLLENGTH)).intValue() << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += ((Integer)map.get(URLCOMPS)).intValue() << ((Integer) coeff.get(URLCOMPS)).intValue();
// prefer long descriptions
ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
ranking += ((Integer)map.get(DESCRLENGTH)).intValue() << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += ((Integer)map.get(DESCRCOMPS)).intValue() << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking;
}
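The page that actually presents these ranking values for a URL is not part of this hunk. Purely as an illustration of how the two new map-returning methods might be consumed together (the class and method names below are hypothetical; only the getPreRanking/getPostRanking signatures are taken from the code above):

import java.util.HashMap;
import java.util.Set;

// Illustration only, not part of this commit: collects the per-attribute
// ranking values for one search result by calling the two new map-returning
// methods; the plasma* types are the YaCy classes used in the code above.
public class RankingValuesSketch {
    public static HashMap rankingValues(plasmaSearchRankingProfile profile,
                                        plasmaWordIndexEntry entry,
                                        plasmaSearchQuery query,
                                        Set topwords,
                                        String[] urlcomps,
                                        String[] descrcomps,
                                        plasmaCrawlLURL.Entry page) {
        HashMap values = new HashMap();
        // normalized index-entry attributes: ENTROPY, DATE, YBR, POSINTEXT, ...
        values.putAll(profile.getPreRanking(entry));
        // query/result attributes: URLLENGTH, DESCRLENGTH, QUERYINURL, ...
        values.putAll(profile.getPostRanking(entry, query, topwords, urlcomps, descrcomps, page));
        return values;
    }
}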
