diff --git a/ChangeLog b/ChangeLog
index 9c3d7555a..59ac638a8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,8 @@ version 0.44svn
  * FIXED: special chars(like german umlauts) in tagNames allowed. (Allo)
  * ADDED: Show public Bookmarks in Bookmarks.html, private ones, if the user is logged in. (Allo)
  * FIXED: /xml/bookmarks/* now uses one file for private/public entries. private only with password.
+ * ADDED: possibility to get the ranking for a url. (Allo)
+
 version 0.43
  * UPDATED: new database handling of index entry objects, less IO overhead (Orbiter)
  * ADDED: many new ranking attributes and handling routines (Orbiter)
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
index 169df6865..ecf580fec 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
@@ -159,19 +159,117 @@ public class plasmaSearchRankingProfile {
         return new String(ext);
     }
 
+    /**
+     * Collects the unweighted pre-ranking attribute values of an index entry.
+     * preRanking() shifts each value left by its coefficient from coeff and
+     * sums the results.
+     *
+     * @param normalizedEntry index entry the attribute values are read from
+     * @return map from attribute name (ENTROPY, DATE, YBR, POSINTEXT,
+     *         WORDDISTANCE, HITCOUNT, DOMLENGTH) to an Integer
+     */
+    public HashMap getPreRanking(plasmaWordIndexEntry normalizedEntry){
+        HashMap map=new HashMap();
+        map.put(ENTROPY, new Integer(normalizedEntry.getQuality()));
+        map.put(DATE, new Integer(normalizedEntry.getVirtualAge()));
+        map.put(YBR, new Integer(plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash())));
+        map.put(POSINTEXT, new Integer((normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext())));
+        map.put(WORDDISTANCE, new Integer((normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance())));
+        map.put(HITCOUNT, new Integer((normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount()));
+        map.put(DOMLENGTH, new Integer((255 - normalizedEntry.domlengthNormalized())));
+        return map;
+    }
     public long preRanking(plasmaWordIndexEntry normalizedEntry) {
         long ranking = 0;
-
-        ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue();
-        ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue();
-        ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue();
-        ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
-        ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
-        ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
-        ranking += (255 - normalizedEntry.domlengthNormalized()) << ((Integer) coeff.get(DOMLENGTH)).intValue();
+        HashMap map=getPreRanking(normalizedEntry);
+        ranking += ((Integer)map.get(ENTROPY)).intValue() << ((Integer) coeff.get(ENTROPY)).intValue();
+        ranking += ((Integer)map.get(DATE)).intValue() << ((Integer) coeff.get(DATE)).intValue();
+        ranking += ((Integer)map.get(YBR)).intValue() << ((Integer) coeff.get(YBR)).intValue();
+        ranking += ((Integer)map.get(POSINTEXT)).intValue() << ((Integer) coeff.get(POSINTEXT)).intValue();
+        ranking += ((Integer)map.get(WORDDISTANCE)).intValue() << ((Integer) coeff.get(WORDDISTANCE)).intValue();
+        ranking += ((Integer)map.get(HITCOUNT)).intValue() << ((Integer) coeff.get(HITCOUNT)).intValue();
+        ranking += ((Integer)map.get(DOMLENGTH)).intValue() << ((Integer) coeff.get(DOMLENGTH)).intValue();
         return ranking;
     }
 
+    /**
+     * Collects the unweighted post-ranking attribute values of a search result.
+     * postRanking() shifts each value left by its coefficient from coeff and
+     * sums the results.
+     *
+     * URLCOMPINTOPLIST and DESCRCOMPINTOPLIST hold a HashMap from component to
+     * 256 times the number of its occurrences if topwords contains it, else 0.
+     * QUERYINURL and QUERYINDESCR hold a HashMap from query hash to 256 if the
+     * hash is among the hashed url/description components, else 0.
+     * URLLENGTH, URLCOMPS, DESCRLENGTH and DESCRCOMPS hold a single Integer.
+     *
+     * @param normalizedEntry not read by this method
+     * @param query           its queryHashes are matched against the components
+     * @param topwords        words looked up for every url/description component
+     * @param urlcomps        url components, as split by the caller
+     * @param descrcomps      description components, as split by the caller
+     * @param page            supplies url() and descr() for the length values
+     * @return map from attribute name to an Integer or a nested HashMap
+     */
+    public HashMap getPostRanking(plasmaWordIndexEntry normalizedEntry,
+                                  plasmaSearchQuery query,
+                                  Set topwords,
+                                  String[] urlcomps,
+                                  String[] descrcomps,
+                                  plasmaCrawlLURL.Entry page){
+        HashMap map=new HashMap();
+        HashMap tmp, tmp2;
+
+        //apply 'common-sense' heuristic using references
+        // count every occurrence: repeated components must add up just like in the
+        // per-element loops this map replaces in postRanking()
+        tmp=new HashMap();
+        for (int j = 0; j < urlcomps.length; j++) {
+            Integer seen = (Integer) tmp.get(urlcomps[j]);
+            int sum = (seen == null) ? 0 : seen.intValue();
+            if (topwords.contains(urlcomps[j])) sum += 256;
+            tmp.put(urlcomps[j], new Integer(sum));
+        }
+        map.put(URLCOMPINTOPLIST, tmp);
+        tmp=new HashMap();
+        for (int j = 0; j < descrcomps.length; j++) {
+            Integer seen = (Integer) tmp.get(descrcomps[j]);
+            int sum = (seen == null) ? 0 : seen.intValue();
+            if (topwords.contains(descrcomps[j])) sum += 256;
+            tmp.put(descrcomps[j], new Integer(sum));
+        }
+        map.put(DESCRCOMPINTOPLIST, tmp);
+
+        // apply query-in-result matching
+        Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
+        Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps);
+        Iterator shi = query.queryHashes.iterator();
+        String queryhash;
+        tmp=new HashMap();
+        tmp2=new HashMap();
+        while (shi.hasNext()) {
+            queryhash = (String) shi.next();
+            if (urlcomph.contains(queryhash))
+                tmp.put(queryhash, new Integer(256));
+            else
+                tmp.put(queryhash, new Integer(0));
+            if (descrcomph.contains(queryhash))
+                tmp2.put(queryhash, new Integer(256));
+            else
+                tmp2.put(queryhash, new Integer(0));
+        }
+        map.put(QUERYINURL, tmp);
+        map.put(QUERYINDESCR, tmp2);
+
+        // prefer short urls
+        map.put(URLLENGTH, new Integer((256 - page.url().toString().length())));
+        map.put(URLCOMPS, new Integer((32 - urlcomps.length)));
+
+        // prefer long descriptions
+        map.put(DESCRLENGTH, new Integer((255 * page.descr().length() / 80)));
+        map.put(DESCRCOMPS, new Integer((255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12)));
+        return map;
+    }
     public long postRanking(
             plasmaWordIndexEntry normalizedEntry,
             plasmaSearchQuery query,
@@ -182,33 +280,41 @@ public class plasmaSearchRankingProfile {
 
         // apply pre-calculated order attributes
         long ranking = this.preRanking(normalizedEntry);
+        HashMap map=getPostRanking(normalizedEntry, query, topwords, urlcomps, descrcomps, page);
+        Iterator it;
+        HashMap tmp;
 
         // apply 'common-sense' heuristic using references
-        for (int j = 0; j < urlcomps.length; j++) {
-            if (topwords.contains(urlcomps[j])) ranking += 256 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue();
+        tmp=(HashMap) map.get(URLCOMPINTOPLIST);
+        it=tmp.keySet().iterator();
+        while(it.hasNext()){
+            ranking+= ((Integer)tmp.get((String)it.next())).intValue() << ((Integer)coeff.get(URLCOMPINTOPLIST)).intValue();
         }
-        for (int j = 0; j < descrcomps.length; j++) {
-            if (topwords.contains(descrcomps[j])) ranking += 256 << ((Integer) coeff.get(DESCRCOMPINTOPLIST)).intValue();
+        tmp=(HashMap) map.get(DESCRCOMPINTOPLIST);
+        it=tmp.keySet().iterator();
+        while(it.hasNext()){
+            ranking+= ((Integer)tmp.get((String)it.next())).intValue() << ((Integer)coeff.get(DESCRCOMPINTOPLIST)).intValue();
         }
 
         // apply query-in-result matching
-        Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
-        Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps);
-        Iterator shi = query.queryHashes.iterator();
-        String queryhash;
-        while (shi.hasNext()) {
-            queryhash = (String) shi.next();
-            if (urlcomph.contains(queryhash)) ranking += 256 << ((Integer) coeff.get(QUERYINURL)).intValue();
-            if (descrcomph.contains(queryhash)) ranking += 256 << ((Integer) coeff.get(QUERYINDESCR)).intValue();
+        tmp=(HashMap) map.get(QUERYINURL);
+        it=tmp.keySet().iterator();
+        while(it.hasNext()){
+            ranking+= ((Integer)tmp.get((String)it.next())).intValue() << ((Integer)coeff.get(QUERYINURL)).intValue();
+        }
+        tmp=(HashMap) map.get(QUERYINDESCR);
+        it=tmp.keySet().iterator();
+        while(it.hasNext()){
+            ranking+= ((Integer)tmp.get((String)it.next())).intValue() << ((Integer)coeff.get(QUERYINDESCR)).intValue();
         }
 
         // prefer short urls
-        ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
-        ranking += (32 - urlcomps.length) << ((Integer) coeff.get(URLCOMPS)).intValue();
+        ranking += ((Integer)map.get(URLLENGTH)).intValue() << ((Integer) coeff.get(URLLENGTH)).intValue();
+        ranking += ((Integer)map.get(URLCOMPS)).intValue() << ((Integer) coeff.get(URLCOMPS)).intValue();
 
         // prefer long descriptions
-        ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
-        ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
+        ranking += ((Integer)map.get(DESCRLENGTH)).intValue() << ((Integer) coeff.get(DESCRLENGTH)).intValue();
+        ranking += ((Integer)map.get(DESCRCOMPS)).intValue() << ((Integer) coeff.get(DESCRCOMPS)).intValue();
 
         return ranking;
     }