enhancements to ranking:
- indexURL.probablyWordURL now returns the guessed URL (or null) instead of a boolean and loops over a testTLDs table instead of eight copy-pasted checks
- new indexURL.isWordRootURL verifies that a URL hash is the root URL of a guessable word-URL
- new plasmaSearchPreOrder.remove() filters root-domain path extensions and duplicate domains before URLs are fetched; plasmaSearchResult.removeDoubleDom() is removed

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2535 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 63893003be
commit 64bed59ee8

@@ -571,36 +571,39 @@ public class indexURL {
         return hash3 + hash2 + hash1 + hash0;
     }
 
-    private static final char[] rootURLFlags = new char[] {
-        subdomPortPath("www", 80, ""),
-        subdomPortPath("", 80, "")
-    };
-
     private static char subdomPortPath(String subdom, int port, String rootpath) {
         return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0);
     }
 
+    private static final char rootURLFlag = subdomPortPath("www", 80, "");
+
     public static final boolean probablyRootURL(String urlHash) {
-        for (int i = 0; i < rootURLFlags.length; i++) if (urlHash.charAt(6) == rootURLFlags[i]) return true;
-        return false;
+        return (urlHash.charAt(5) == rootURLFlag);
     }
 
     private static String protocolHostPort(String protocol, String host, int port) {
         return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5);
     }
 
-    public static final boolean probablyWordURL(String urlHash, String word) {
-        if (word == null) return false;
+    private static String[] testTLDs = new String[] {"com", "net", "org", "uk", "fr", "de", "es", "it"};
+
+    public static final URL probablyWordURL(String urlHash, String word) {
+        if (word == null) return null;
         String pattern = urlHash.substring(6, 11);
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".com", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".net", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".org", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".uk", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".fr", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".de", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".es", 80))) return true;
-        if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".it", 80))) return true;
-        return false;
+        for (int i = 0; i < testTLDs.length; i++) {
+            if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80)))
+                try {
+                    return new URL("http://www." + word.toLowerCase() + "." + testTLDs[i]);
+                } catch (MalformedURLException e) {
+                    return null;
+                }
+        }
+        return null;
+    }
+
+    public static final boolean isWordRootURL(String givenURLHash, String word) {
+        if (!(probablyRootURL(givenURLHash))) return false;
+        URL wordURL = probablyWordURL(givenURLHash, word);
+        if (wordURL == null) return false;
+        return urlHash(wordURL).equals(givenURLHash);
+    }
 
     public static final int domLengthEstimation(String urlHash) {
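Note on the hunk above: probablyWordURL now returns the reconstructed URL (or null) instead of a bare boolean, so callers can re-hash and display it. The underlying trick is that characters 6-10 of a YaCy URL hash fingerprint protocol:host:port, so a hash can be tested against guessed hosts like http://www.<word>.<tld> without loading anything. A minimal self-contained sketch of that prefix test, using a plain hex MD5 in place of YaCy's kelondroBase64Order coder (so the actual fingerprints differ):

    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;

    // Sketch only: YaCy encodes the MD5 with kelondroBase64Order and keeps 5
    // characters; plain hex is used here so the example runs standalone.
    public class WordURLSketch {
        static String hostFingerprint(String protocol, String host, int port) throws NoSuchAlgorithmException {
            byte[] md5 = MessageDigest.getInstance("MD5").digest((protocol + ":" + host + ":" + port).getBytes());
            StringBuilder hex = new StringBuilder();
            for (byte b : md5) hex.append(String.format("%02x", b));
            return hex.substring(0, 5); // 5-character fingerprint, like protocolHostPort()
        }

        // returns the guessed word-URL as a string, or null if no TLD matches
        static String probableWordHost(String fingerprint, String word) throws NoSuchAlgorithmException {
            String[] testTLDs = new String[] {"com", "net", "org", "uk", "fr", "de", "es", "it"};
            for (int i = 0; i < testTLDs.length; i++) {
                String host = "www." + word.toLowerCase() + "." + testTLDs[i];
                if (fingerprint.equals(hostFingerprint("http", host, 80))) return "http://" + host;
            }
            return null;
        }

        public static void main(String[] args) throws NoSuchAlgorithmException {
            String fingerprint = hostFingerprint("http", "www.yacy.net", 80);
            System.out.println(probableWordHost(fingerprint, "yacy")); // prints http://www.yacy.net
        }
    }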

@@ -223,6 +223,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             preorderTime = preorderTime - (System.currentTimeMillis() - pst);
             if (preorderTime < 0) preorderTime = 200;
             plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime);
+            preorder.remove(true, true);
             profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
             profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size());
 
@@ -241,11 +242,10 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
             try {
                 while (preorder.hasNext()) {
-                    //if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
-                    //if (acc.sizeFetched() >= minEntries) break;
+                    if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
                     preorderEntry = preorder.next();
                     entry = (indexEntry) preorderEntry[0];
                     // load only urls if there was not yet a root url of that hash
                     preranking = (Long) preorderEntry[1];
                     // find the url entry
                     page = urlStore.load(entry.urlHash(), entry);
 
@@ -267,7 +267,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             // apply filter
             profileLocal.startTimer();
             acc.removeRedundant();
-            //acc.removeDoubleDom();
             profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
             profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());
 
@@ -281,6 +280,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             profileLocal.startTimer();
             if (maxtime < 0) maxtime = 200;
             plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, maxtime);
+            preorder.remove(true, true);
             profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
             profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size());
 
@@ -320,7 +320,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             // apply filter
             profileLocal.startTimer();
             acc.removeRedundant();
-            //acc.removeDoubleDom();
             profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
             profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());
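The two preorder.remove(true, true) calls above run the new de-duplication before any URL is fetched from urlStore, and the reworked break condition keeps the postorder loop fetching until the time budget is spent and minEntries results are in hand (both, not either). A compact standalone model of that break rule, with illustrative names (candidates, budgetMillis) that are not YaCy's:

    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;

    // Model of the revised postorder loop: stop only once the deadline has
    // passed AND at least minEntries candidates have been fetched.
    public class PostorderBudgetSketch {
        static int fetch(List<String> candidates, int minEntries, long budgetMillis) {
            long limit = System.currentTimeMillis() + budgetMillis;
            int fetched = 0;
            Iterator<String> i = candidates.iterator();
            while (i.hasNext()) {
                if ((System.currentTimeMillis() >= limit) && (fetched >= minEntries)) break;
                i.next(); // stands in for urlStore.load(...)
                fetched++;
            }
            return fetched;
        }

        public static void main(String[] args) {
            // deadline already passed, but the loop still fetches the minimum
            System.out.println(fetch(Arrays.asList("a", "b", "c"), 2, 0)); // prints 2
        }
    }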

@@ -44,13 +44,16 @@ package de.anomic.plasma;
 
 import java.io.File;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.TreeMap;
+import java.util.Map;
+import java.util.Iterator;
 
 import de.anomic.server.serverCodings;
 import de.anomic.server.serverFileUtils;
 import de.anomic.index.indexContainer;
 import de.anomic.index.indexEntry;
+import de.anomic.index.indexURL;
 import de.anomic.kelondro.kelondroBinSearch;
 
 public final class plasmaSearchPreOrder {
 
@@ -100,6 +103,31 @@ public final class plasmaSearchPreOrder {
         }
     }
 
+    public void remove(boolean rootDomExt, boolean doubleDom) {
+        // this removes all references to urls that are extended paths of existing 'RootDom'-urls
+        HashSet rootDoms = new HashSet();
+        HashSet doubleDoms = new HashSet();
+        Iterator i = pageAcc.entrySet().iterator();
+        Map.Entry entry;
+        indexEntry iEntry;
+        String hashpart;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            iEntry = (indexEntry) entry.getValue();
+            hashpart = iEntry.urlHash().substring(6);
+            if (((rootDomExt) && (rootDoms.contains(hashpart))) ||
+                ((doubleDom) && (doubleDoms.contains(hashpart)))) {
+                i.remove();
+                if (pageAcc.size() <= query.wantedResults) return;
+            } else {
+                if (indexURL.isWordRootURL(iEntry.urlHash(), query.words(""))) {
+                    rootDoms.add(hashpart);
+                }
+            }
+            doubleDoms.add(hashpart);
+        }
+    }
+
     public static void loadYBR(File rankingPath, int count) {
         // load ranking tables
         if (rankingPath.exists()) {
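The remove() method added above walks pageAcc in ranking order once: the first entry per domain fingerprint (urlHash().substring(6)) survives, later ones are dropped when doubleDom is set, and entries whose fingerprint belongs to a known word-root URL are dropped as path extensions when rootDomExt is set. A reduced sketch of the keep-first-per-domain pass, using the full host string as the fingerprint for readability:

    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.TreeMap;

    // Reduced model of plasmaSearchPreOrder.remove(..., doubleDom=true): one
    // pass over the ranked map keeps the first URL per host, removes the rest.
    public class DoubleDomSketch {
        static void removeDoubleDom(TreeMap<Long, String> pageAcc) {
            HashSet<String> seenHosts = new HashSet<String>();
            Iterator<Map.Entry<Long, String>> i = pageAcc.entrySet().iterator();
            while (i.hasNext()) {
                String host = i.next().getValue();
                if (!seenHosts.add(host)) i.remove(); // add() returns false on repeats
            }
        }

        public static void main(String[] args) {
            TreeMap<Long, String> pageAcc = new TreeMap<Long, String>();
            pageAcc.put(1L, "www.a.com");
            pageAcc.put(2L, "www.b.com");
            pageAcc.put(3L, "www.a.com");
            removeDoubleDom(pageAcc);
            System.out.println(pageAcc); // prints {1=www.a.com, 2=www.b.com}
        }
    }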

@ -175,7 +175,7 @@ public class plasmaSearchRankingProfile {
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += (256 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
ranking += (indexURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0;
ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord) != null) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
/*
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
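In the ranking formula above, each component is weighted by a left shift: a coefficient c from the ranking profile multiplies its base value by 2^c, so a probablyWordURL match (base value 256) can be tuned to dominate the other components. A worked example with invented coefficient values:

    // Worked example of the shift weighting: value << c equals value * 2^c.
    // The coefficient values below are made up for illustration.
    public class RankingShiftSketch {
        public static void main(String[] args) {
            long ranking = 0;
            int queryInUrlCoeff = 4;
            int domLengthCoeff = 2;
            ranking += 256 << queryInUrlCoeff;        // word found in url: 256 * 16 = 4096
            ranking += (256 - 240) << domLengthCoeff; // short-domain bonus:  16 *  4 =   64
            System.out.println(ranking);              // prints 4160
        }
    }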

@ -161,21 +161,6 @@ public final class plasmaSearchResult {
results = null;
}
public void removeDoubleDom() {
Iterator i = pageAcc.entrySet().iterator();
HashSet doms = new HashSet();
Map.Entry entry;
String dom;
while (i.hasNext()) {
if (pageAcc.size() <= query.wantedResults) return;
entry = (Map.Entry) i.next();
dom = ((plasmaCrawlLURL.Entry) entry.getValue()).url().getHost();
if (doms.contains(dom)) i.remove(); else doms.add(dom);
}
}
public void removeRedundant() {
// remove all urls from the pageAcc structure that occur double by specific redundancy rules
// a link is redundant, if a sub-path of the url is cited before. redundant urls are removed

@ -2068,6 +2068,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*/
//addScoreForked(ref, gs, descr.split(" "));
//addScoreForked(ref, gs, urlstring.split("/"));
URL wordURL;
if (urlstring.matches(query.urlMask)) { //.* is default
snippet = snippetCache.retrieve(url, query.queryHashes, false, 260);
if (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH) {
@ -2086,7 +2087,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_results_" + i + "_former", formerSearch);
prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm(true) + ", domLengthEstimated=" + indexURL.domLengthEstimation(urlhash) +
((indexURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") +
((indexURL.probablyWordURL(urlhash, query.words(""))) ? ", probablyWordURL" : ""));
(((wordURL = indexURL.probablyWordURL(urlhash, query.words(""))) != null) ? ", probablyWordURL=" + wordURL.toNormalform() : ""));
// adding snippet if available
if (snippet.exists()) {
prop.put("type_results_" + i + "_snippet", 1);
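The rankingprops line above uses an assignment inside the condition so that probablyWordURL is called only once: its result is stored in wordURL and null-checked in the same expression. The same idiom with java.net.URL (YaCy's own URL class and its toNormalform() differ, so this is only a shape-alike with a hypothetical guess() helper):

    import java.net.MalformedURLException;
    import java.net.URL;

    // Shape-alike of the assign-and-test idiom: guess() stands in for
    // indexURL.probablyWordURL and may return null.
    public class AssignTestSketch {
        static URL guess(String word) {
            try {
                return new URL("http://www." + word + ".com");
            } catch (MalformedURLException e) {
                return null;
            }
        }

        public static void main(String[] args) {
            URL wordURL;
            String props = "domLengthEstimated=5"
                    + (((wordURL = guess("yacy")) != null) ? ", probablyWordURL=" + wordURL.toExternalForm() : "");
            System.out.println(props); // prints domLengthEstimated=5, probablyWordURL=http://www.yacy.com
        }
    }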
