re-design of post-ranking process

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1537 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 883f03c399
commit 6eef848954

@ -119,71 +119,31 @@ public final class plasmaSearchPreOrder {
return (plasmaWordIndexEntry) pageAcc.remove(top);
}
/*
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
Iterator i = container.entries();
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
plasmaWordIndexEntry entry;
while (i.hasNext()) {
if (System.currentTimeMillis() > limitTime) break;
entry = (plasmaWordIndexEntry) i.next();
addEntry(entry);
}
}
public void addEntry(plasmaWordIndexEntry indexEntry) {
long ranking = 0;
long factor = 4096L*4096L;
for (int i = 0; i < 3; i++) {
if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L;
else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L;
else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * ybr_p(indexEntry.getUrlHash());
factor = factor / 4096L;
}
int wordpos = indexEntry.posintext();
if (wordpos == 0) wordpos = 1000;
ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry);
}
*/
/**
 * Ranks the entries of a container and stores them in the page accumulator.
 *
 * Works in two passes over container.entries():
 * the first pass widens entryMin/entryMax with every visited entry so that they
 * can serve as normalization limits; the second pass computes a ranking for a
 * normalized copy of each visited entry and stores the original entry in pageAcc.
 *
 * @param container the container whose entries are to be ranked
 * @param maxTime   time budget in milliseconds for the first pass; a negative
 *                  value disables the limit
 */
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
    // a negative maxTime means "no deadline"
    long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
    plasmaWordIndexEntry indexEntry;

    // first pass: find min/max to obtain limits for normalization.
    // Exactly one next() per hasNext() check: advancing the iterator twice per
    // iteration would skip entries and may run past the end of the container.
    Iterator i = container.entries();
    int count = 0;
    while (i.hasNext()) {
        if (System.currentTimeMillis() > limitTime) break;
        indexEntry = (plasmaWordIndexEntry) i.next();
        if (entryMin == null) entryMin = (plasmaWordIndexEntry) indexEntry.clone(); else entryMin.min(indexEntry);
        if (entryMax == null) entryMax = (plasmaWordIndexEntry) indexEntry.clone(); else entryMax.max(indexEntry);
        count++;
    }

    // second pass: normalize entries and get ranking.
    // Only the entries seen in the first pass are ranked (count may be smaller
    // than the container size if the deadline was hit). The hasNext() guard
    // protects against an iterator that yields fewer entries the second time.
    i = container.entries();
    for (int j = 0; j < count && i.hasNext(); j++) {
        indexEntry = (plasmaWordIndexEntry) i.next();
        // the ranking is computed on a normalized copy; the stored value is the
        // untouched original entry, keyed by encoded ranking + url hash
        pageAcc.put(serverCodings.encodeHex(query.ranking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry);
    }
}
public void addEntry(plasmaWordIndexEntry indexEntry) {
long ranking = 0;
for (int i = 0; i < 3; i++) {
if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality() << (4 * (3 - i));
else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += indexEntry.getVirtualAge() << (4 * (3 - i));
else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += ybr_p(indexEntry.getUrlHash()) << (4 * (3 - i));
}
ranking += (indexEntry.posintext() == 0) ? 0 : (255 - indexEntry.posintext()) << 11;
ranking += (indexEntry.worddistance() == 0) ? 0 : (255 - indexEntry.worddistance()) << 10;
ranking += (indexEntry.hitcount() == 0) ? 0 : indexEntry.hitcount() << 9;
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry);
/**
 * Returns the current normalization limits.
 *
 * @return a fresh two-element array: index 0 holds entryMin, index 1 holds entryMax
 */
public plasmaWordIndexEntry[] getNormalizer() {
    plasmaWordIndexEntry[] limits = new plasmaWordIndexEntry[2];
    limits[0] = entryMin;
    limits[1] = entryMax;
    return limits;
}
public static int ybr_p(String urlHash) {

@ -278,5 +278,5 @@ public class plasmaSearchProfile implements Cloneable {
}
}
}
}

@ -177,4 +177,19 @@ public final class plasmaSearchQuery {
if (blueList.contains(word)) it.remove();
}
}
/**
 * Computes a ranking value for an index entry according to this query's order.
 *
 * The three slots of this.order contribute their attribute (quality, virtual age
 * or YBR value) shifted left by 12, 8 and 4 bits respectively. On top of that,
 * word position, word distance, hit count and the normalized domain length add
 * fixed-weight contributions.
 *
 * @param normalizedEntry the entry to rank; presumably already normalized by the
 *                        caller (see parameter name) - this method does not normalize
 * @return the accumulated ranking value
 */
public long ranking(plasmaWordIndexEntry normalizedEntry) {
    long score = 0;

    // order-dependent part: slot 0 weighs most (<< 12), slot 2 least (<< 4)
    for (int slot = 0; slot < 3; slot++) {
        int shift = 4 * (3 - slot);
        int attribute = 0; // stays 0 if the slot names none of the known orders
        if (this.order[slot].equals(plasmaSearchQuery.ORDER_QUALITY)) {
            attribute = normalizedEntry.getQuality();
        } else if (this.order[slot].equals(plasmaSearchQuery.ORDER_DATE)) {
            attribute = normalizedEntry.getVirtualAge();
        } else if (this.order[slot].equals(plasmaSearchQuery.ORDER_YBR)) {
            attribute = plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash());
        }
        score += attribute << shift;
    }

    // order-independent part: a value of 0 for an attribute contributes nothing
    if (normalizedEntry.posintext() != 0) {
        score += (255 - normalizedEntry.posintext()) << 11;
    }
    if (normalizedEntry.worddistance() != 0) {
        score += (255 - normalizedEntry.worddistance()) << 10;
    }
    if (normalizedEntry.hitcount() != 0) {
        score += normalizedEntry.hitcount() << 9;
    }

    // smaller normalized domain length yields a larger contribution
    score += (255 - normalizedEntry.domlengthNormalized()) << 8;
    return score;
}
}

@ -58,6 +58,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
public final class plasmaSearchResult {
private plasmaWordIndexEntry entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@ -72,6 +73,8 @@ public final class plasmaSearchResult {
this.query = query;
this.globalContributions = 0;
this.localContributions = 0;
this.entryMin = null;
this.entryMax = null;
}
public plasmaSearchResult cloneSmart() {
@ -101,10 +104,10 @@ public final class plasmaSearchResult {
}
protected void addResult(plasmaWordIndexEntry indexEntry, plasmaCrawlLURL.Entry page) {
// this does 3 things:
// 1. simply store indexEntry and page to a cache
// 2. calculate references and store them to cache
// 2. add reference to reference sorting table
// make min/max for normalization
if (entryMin == null) entryMin = (plasmaWordIndexEntry) indexEntry.clone(); else entryMin.min(indexEntry);
if (entryMax == null) entryMax = (plasmaWordIndexEntry) indexEntry.clone(); else entryMax.max(indexEntry);
// take out relevant information for reference computation
URL url = page.url();
@ -136,33 +139,25 @@ public final class plasmaSearchResult {
plasmaCrawlLURL.Entry page;
String[] urlcomps;
String[] descrcomps;
long ranking, factor;
long ranking;
String queryhash;
for (int i = 0; i < results.size(); i++) {
// take out values from result array
resultVector = (Object[]) results.get(i);
indexEntry = (plasmaWordIndexEntry) resultVector[0];
page = (plasmaCrawlLURL.Entry) resultVector[1];
urlcomps = (String[]) resultVector[2];
descrcomps = (String[]) resultVector[3];
// apply pre-calculated order attributes
ranking = 0;
factor = 4096L*4096L;
for (int j = 0; j < 3; j++) {
if (query.order[j].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L;
else if (query.order[j].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L;
else if (query.order[j].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * plasmaSearchPreOrder.ybr_p(indexEntry.getUrlHash());
factor = factor / 4096L;
}
int wordpos = indexEntry.posintext();
if (wordpos == 0) wordpos = 1000;
ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance());
ranking = query.ranking(indexEntry.generateNormalized(entryMin, entryMax));
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length;
for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking += 10L*4096L*4096L / descrcomps.length;
urlcomps = (String[]) resultVector[2];
for (int j = 0; j < urlcomps.length; j++) {
if (commonSense.contains(urlcomps[j])) ranking += 1 << 12;
}
descrcomps = (String[]) resultVector[3];
for (int j = 0; j < descrcomps.length; j++) {
if (commonSense.contains(descrcomps[j])) ranking += 1 << 11;
}
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
@ -170,17 +165,18 @@ public final class plasmaSearchResult {
Iterator shi = query.queryHashes.iterator();
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) ranking += 90L*4096L*4096L / urlcomps.length / query.queryHashes.size();
if (descrcomph.contains(queryhash)) ranking += 40L*4096L*4096L / descrcomps.length / query.queryHashes.size();
if (urlcomph.contains(queryhash)) ranking += 1 << 13;
if (descrcomph.contains(queryhash)) ranking += 1 << 14;
}
// prefer short urls
ranking -= 64L * page.url().toString().length();
ranking -= 64L * urlcomps.length;
page = (plasmaCrawlLURL.Entry) resultVector[1];
ranking += (255 - page.url().toString().length()) << 10;
ranking += (24 - urlcomps.length) << 10;
// prefer long descriptions
ranking += 64L * (40 - Math.abs(40 - Math.min(40, page.descr().length())));
ranking += 64L * ( 8 - Math.abs( 8 - Math.min( 8, descrcomps.length)));
ranking += (40 - Math.abs(40 - Math.min(40, page.descr().length()))) << 10;
ranking += ( 8 - Math.abs( 8 - Math.min( 8, descrcomps.length))) << 10;
// insert value
//System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());

@ -535,6 +535,19 @@ public class plasmaURL {
return hash3 + hash2 + hash1 + hash0;
}
/**
 * Generates an estimation of the original domain length from a url hash.
 *
 * The estimation is derived from a two-bit key inside the flag byte, which is
 * decoded from the character at index 11 of the hash.
 *
 * @param urlHash the url hash; must have at least 12 characters
 * @return the estimated domain length: 4, 10, 14 or 20
 */
public static final int domLengthEstimation(String urlHash) {
    int flagbyte = kelondroBase64Order.enhancedCoder.decodeByte(urlHash.charAt(11));
    // Mask with 3 (binary 11) to obtain a key in 0..3. The previous mask 4 could
    // only produce 0 or 4, which made the cases 1, 2 and 3 unreachable.
    // NOTE(review): assumes the length key occupies the two lowest bits of the
    // flag byte - confirm against the code that generates the url hash.
    int domLengthKey = flagbyte & 3;
    switch (domLengthKey) {
        case 0: return 4;
        case 1: return 10;
        case 2: return 14;
        default: return 20; // key 3
    }
}
public static final String oldurlHash(URL url) {
if (url == null) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(htmlFilterContentScraper.urlNormalform(url))).substring(0, urlHashLength);

@ -388,6 +388,12 @@ public final class plasmaWordIndexEntry implements Cloneable {
this.quality = (this.quality == 0) ? 0 : 1 + 255 * (this.quality - min.quality ) / (1 + max.quality - min.quality);
}
/**
 * Produces a normalized copy of this entry and leaves this entry untouched.
 *
 * @param min lower limits handed to normalize()
 * @param max upper limits handed to normalize()
 * @return a clone of this entry on which normalize(min, max) has been applied
 */
public plasmaWordIndexEntry generateNormalized(plasmaWordIndexEntry min, plasmaWordIndexEntry max) {
    final plasmaWordIndexEntry normalizedCopy = (plasmaWordIndexEntry) this.clone();
    normalizedCopy.normalize(min, max);
    return normalizedCopy;
}
// returns the url hash stored in this entry
public String getUrlHash() { return urlHash; }
// returns the quality value stored in this entry
public int getQuality() { return quality; }
// returns the lastModified value converted by plasmaWordIndex.microDateDays
public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); }
@ -402,6 +408,10 @@ public final class plasmaWordIndexEntry implements Cloneable {
// returns a new String built from the language field
public String getLanguage() { return new String(language); }
// returns the doctype character of this entry
public char getType() { return doctype; }
// true if the local flag of this entry equals LT_LOCAL
public boolean isLocal() { return localflag == LT_LOCAL; }
/**
 * Scales the estimated domain length of this entry's url hash by 255/20,
 * using integer arithmetic.
 *
 * @return 255 * plasmaURL.domLengthEstimation(urlHash) / 20
 */
public int domlengthNormalized() {
    final int estimatedLength = plasmaURL.domLengthEstimation(this.urlHash);
    // multiply first, then divide, so the integer division truncates only once
    return 255 * estimatedLength / 20;
}
public static void main(String[] args) {
// outputs the word hash to a given word

Loading…
Cancel
Save