enhanced search result computation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2527 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 809960ddc6
commit 03835c2ee8

@ -154,6 +154,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
*/ */
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
public static String[] urlComps(String normalizedURL) { public static String[] urlComps(String normalizedURL) {
int p = normalizedURL.indexOf("//");
if (p > 0) normalizedURL = normalizedURL.substring(p + 2);
return normalizedURL.toLowerCase().split(splitrex); // word components of the url return normalizedURL.toLowerCase().split(splitrex); // word components of the url
} }

@ -266,8 +266,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// apply filter // apply filter
profileLocal.startTimer(); profileLocal.startTimer();
//acc.removeRedundant(); acc.removeRedundant();
acc.removeDoubleDom(); //acc.removeDoubleDom();
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered()); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());
@ -294,12 +294,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
plasmaCrawlLURL.Entry page; plasmaCrawlLURL.Entry page;
Long preranking; Long preranking;
Object[] preorderEntry; Object[] preorderEntry;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try { try {
while (preorder.hasNext()) { while (preorder.hasNext()) {
//if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break; if (System.currentTimeMillis() >= postorderLimitTime) break;
//if (acc.sizeFetched() >= minEntries) break;
if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
preorderEntry = preorder.next(); preorderEntry = preorder.next();
entry = (indexEntry) preorderEntry[0]; entry = (indexEntry) preorderEntry[0];
preranking = (Long) preorderEntry[1]; preranking = (Long) preorderEntry[1];
@ -322,8 +319,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// apply filter // apply filter
profileLocal.startTimer(); profileLocal.startTimer();
//acc.removeRedundant(); acc.removeRedundant();
acc.removeDoubleDom(); //acc.removeDoubleDom();
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered()); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());

@ -96,7 +96,7 @@ public final class plasmaSearchPreOrder {
this.pageAcc = new TreeMap(); this.pageAcc = new TreeMap();
for (int j = 0; j < count; j++) { for (int j = 0; j < count; j++) {
iEntry = (indexEntry) i.next(); iEntry = (indexEntry) i.next();
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry); pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
} }
} }
@ -150,7 +150,7 @@ public final class plasmaSearchPreOrder {
} }
public Object[] /*{indexEntry, Long}*/ next() { public Object[] /*{indexEntry, Long}*/ next() {
String top = (String) pageAcc.lastKey(); String top = (String) pageAcc.firstKey();
//System.out.println("preorder-key: " + top); //System.out.println("preorder-key: " + top);
Long preranking = new Long(Long.parseLong(top.substring(0, 16), 16)); Long preranking = new Long(Long.parseLong(top.substring(0, 16), 16));
return new Object[]{(indexEntry) pageAcc.remove(top), preranking}; return new Object[]{(indexEntry) pageAcc.remove(top), preranking};

@ -170,16 +170,18 @@ public class plasmaSearchRankingProfile {
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue(); ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue(); ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue(); ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue();
ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue(); ranking += (normalizedEntry.posintext() == 0) ? 0 : (256 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue(); ranking += (normalizedEntry.worddistance() == 0) ? 0 : (256 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue(); ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += (255 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue(); ranking += (256 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
ranking += (indexURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0; ranking += (indexURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0;
ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0; ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
/*
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking); System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
else else
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking); System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking);
*/
return ranking; return ranking;
} }
@ -219,11 +221,11 @@ public class plasmaSearchRankingProfile {
// prefer short urls // prefer short urls
ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue(); ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (32 - urlcomps.length) << ((Integer) coeff.get(URLCOMPS)).intValue(); ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue();
// prefer long descriptions // prefer long descriptions
ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue(); ranking += (256 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue(); ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking; return ranking;
} }

@ -100,7 +100,7 @@ public final class plasmaSearchResult {
} }
public plasmaCrawlLURL.Entry nextElement() { public plasmaCrawlLURL.Entry nextElement() {
Object top = pageAcc.lastKey(); Object top = pageAcc.firstKey();
//System.out.println("postorder-key: " + ((String) top)); //System.out.println("postorder-key: " + ((String) top));
return (plasmaCrawlLURL.Entry) pageAcc.remove(top); return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
} }
@ -154,7 +154,7 @@ public final class plasmaSearchResult {
// insert value // insert value
//System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url()); //System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + page.hash(), page); pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - ranking, 16) + page.hash(), page);
} }
// flush memory // flush memory

Loading…
Cancel
Save