redesign of ranking data structure

- the index administration now uses the same code base for url selection and collection
  as the search interface. The index administration is therefore a good test environment for
  ranking order control (a usage sketch of the new code path follows below)
- removed the old post-sorting algorithms; they will be replaced with a new one
- fixed many bugs that occurred during ranking; in particular, the constraint filtering method
  removed too many links
- fixed the media search flags; they had been attached to too many urls. The effect should be
  better pre-sorting before media loading within snippet fetch
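
The hunks below replace the plasmaWordIndex.Finding result object with a stateful
plasmaSearchRankingProcess. A minimal usage sketch of the new call pattern, assembled from the
IndexControlRWIs_p and plasmaSearchEvent hunks below (variable names are illustrative):

    // sketch only: construct, execute and drain the new ranking process
    plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, filter);
    plasmaSearchRankingProcess ranked =
        new plasmaSearchRankingProcess(sb.wordIndex, query, null, sb.getRanking(), sortorder, Integer.MAX_VALUE);
    ranked.execQuery(true);                 // collect, join and pre-sort the local RWI containers
    indexURLEntry page;
    while ((ranked.size() > 0) && ((page = ranked.bestURL(false)) != null)) {
        // bestURL() returns entries in ranking order and removes them from the process
    }
    Iterator misses = ranked.miss();        // url hashes that could not be loaded from the LURL-DB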

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4223 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 18 years ago
parent 6f1308da2f
commit c48b73cda2

@ -42,7 +42,7 @@
<td><strong>Local Text Indexing</strong></td>
<td><strong>Local Media Indexing</strong></td>
<td><strong>Remote Indexing</strong></td>
<td></td>
<td><strong>Status / Action</strong></td>
</tr>
#{crawlProfiles}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
@ -61,12 +61,14 @@
<td>#(indexMedia)#no::yes#(/indexMedia)#</td>
<td>#(remoteIndexing)#no::yes#(/remoteIndexing)#</td>
<td>#(terminateButton)#::
<div style="text-decoration:blink">Running</div>
<form action="CrawlProfileEditor_p.html" method="get" enctype="multipart/form-data">
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="terminate" value="Terminate" />
</form>
#(/terminateButton)#
#(deleteButton)#::
Finished
<form action="CrawlProfileEditor_p.html" method="get" enctype="multipart/form-data">
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="delete" value="Delete" />

@ -46,7 +46,6 @@ import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.abstractURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverDate;
@ -92,8 +91,8 @@ public class IndexControlRWIs_p {
if (post.containsKey("keystringsearch")) {
keyhash = plasmaCondenser.word2hash(keystring);
prop.put("keyhash", keyhash);
final plasmaWordIndex.Finding finding = genSearchresult(prop, sb, keyhash, null, false, sortorder);
if (finding.size() == 0) {
final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, null, sortorder, false);
if (ranking.filteredCount() == 0) {
prop.put("searchresult", 1);
prop.put("searchresult_word", keystring);
}
@ -103,8 +102,8 @@ public class IndexControlRWIs_p {
if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
}
final plasmaWordIndex.Finding finding = genSearchresult(prop, sb, keyhash, null, false, sortorder);
if (finding.size() == 0) {
final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, null, sortorder, false);
if (ranking.filteredCount() == 0) {
prop.put("searchresult", 2);
prop.put("searchresult_wordhash", keyhash);
}
@ -162,8 +161,8 @@ public class IndexControlRWIs_p {
}
kelondroBitfield flags = compileFlags(post);
int count = (post.get("lines", "all").equals("all")) ? -1 : post.getInt("lines", -1);
final plasmaWordIndex.Finding finding = genSearchresult(prop, sb, keyhash, flags, true, sortorder);
genURLList(prop, keyhash, keystring, finding, flags, count, sortorder);
final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, flags, sortorder, true);
genURLList(prop, keyhash, keystring, ranking, flags, count, sortorder);
}
// transfer to other peer
@ -319,11 +318,11 @@ public class IndexControlRWIs_p {
private static kelondroBitfield compileFlags(serverObjects post) {
kelondroBitfield b = new kelondroBitfield(4);
if (post.get("allurl", "").equals("on")) {
for (int i = 0; i < 32; i++) {b.set(i, true);}
return b;
if (post.get("allurl", "").equals("on")) return null;
if (post.get("flags") != null) {
if (post.get("flags","").length() == 0) return null;
return new kelondroBitfield(4, (String) post.get("flags"));
}
if (post.get("flags") != null) return new kelondroBitfield(4, (String) post.get("flags"));
if (post.get("reference", "").equals("on")) b.set(indexRWIEntry.flag_app_reference, true);
if (post.get("description", "").equals("on")) b.set(indexRWIEntry.flag_app_descr, true);
if (post.get("author", "").equals("on")) b.set(indexRWIEntry.flag_app_author, true);
@ -359,51 +358,52 @@ public class IndexControlRWIs_p {
}
}
private static plasmaWordIndex.Finding genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, boolean urlfetch, int sortorder) {
final plasmaWordIndex.Finding finding = sb.wordIndex.retrieveURLs(new plasmaSearchQuery(keyhash, -1, filter), urlfetch, sortorder, sb.getRanking());
if (finding.size() == 0) {
private static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) {
plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, filter);
plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, null, sb.getRanking(), sortorder, Integer.MAX_VALUE);
ranked.execQuery(fetchURLs);
if (ranked.filteredCount() == 0) {
prop.put("searchresult", 2);
prop.put("searchresult_wordhash", keyhash);
} else {
prop.put("searchresult", 3);
prop.put("searchresult_allurl", finding.size());
prop.put("searchresult_reference", finding.flagcount()[indexRWIEntry.flag_app_reference]);
prop.put("searchresult_description", finding.flagcount()[indexRWIEntry.flag_app_descr]);
prop.put("searchresult_author", finding.flagcount()[indexRWIEntry.flag_app_author]);
prop.put("searchresult_tag", finding.flagcount()[indexRWIEntry.flag_app_tags]);
prop.put("searchresult_url", finding.flagcount()[indexRWIEntry.flag_app_url]);
prop.put("searchresult_emphasized", finding.flagcount()[indexRWIEntry.flag_app_emphasized]);
prop.put("searchresult_image", finding.flagcount()[plasmaCondenser.flag_cat_hasimage]);
prop.put("searchresult_audio", finding.flagcount()[plasmaCondenser.flag_cat_hasaudio]);
prop.put("searchresult_video", finding.flagcount()[plasmaCondenser.flag_cat_hasvideo]);
prop.put("searchresult_app", finding.flagcount()[plasmaCondenser.flag_cat_hasapp]);
prop.put("searchresult_indexof", finding.flagcount()[plasmaCondenser.flag_cat_indexof]);
prop.put("searchresult_allurl", ranked.filteredCount());
prop.put("searchresult_reference", ranked.flagCount()[indexRWIEntry.flag_app_reference]);
prop.put("searchresult_description", ranked.flagCount()[indexRWIEntry.flag_app_descr]);
prop.put("searchresult_author", ranked.flagCount()[indexRWIEntry.flag_app_author]);
prop.put("searchresult_tag", ranked.flagCount()[indexRWIEntry.flag_app_tags]);
prop.put("searchresult_url", ranked.flagCount()[indexRWIEntry.flag_app_url]);
prop.put("searchresult_emphasized", ranked.flagCount()[indexRWIEntry.flag_app_emphasized]);
prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]);
prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]);
prop.put("searchresult_video", ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]);
prop.put("searchresult_app", ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]);
prop.put("searchresult_indexof", ranked.flagCount()[plasmaCondenser.flag_cat_indexof]);
}
return finding;
return ranked;
}
private static void genURLList(serverObjects prop, String keyhash, String keystring, plasmaWordIndex.Finding finding, kelondroBitfield flags, int maxlines, int ordering) {
private static void genURLList(serverObjects prop, String keyhash, String keystring, plasmaSearchRankingProcess ranked, kelondroBitfield flags, int maxlines, int ordering) {
// search for a word hash and generate a list of url links
prop.put("genUrlList_keyHash", keyhash);
if (finding.size() == 0) {
if (ranked.filteredCount() == 0) {
prop.put("genUrlList", 1);
prop.put("genUrlList_count", 0);
prop.put("searchresult", 2);
} else {
prop.put("genUrlList", 2);
prop.put("searchresult", 3);
prop.put("genUrlList_flags", flags.exportB64());
prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64());
prop.put("genUrlList_lines", maxlines);
prop.put("genUrlList_ordering", ordering);
int i = 0;
yacyURL url;
Iterator iter = finding.urls();
indexURLEntry entry;
String us;
long rn = -1;
while (iter.hasNext()) {
entry = (indexURLEntry) iter.next();
while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) {
if ((entry == null) || (entry.comp() == null)) continue;
url = entry.comp().url();
if (url == null) continue;
@ -452,7 +452,7 @@ public class IndexControlRWIs_p {
i++;
if ((maxlines >= 0) && (i >= maxlines)) break;
}
iter = finding.miss().iterator();
Iterator iter = ranked.miss(); // iterates url hash strings
while (iter.hasNext()) {
us = (String) iter.next();
prop.put("genUrlList_urlList_"+i+"_urlExists", "0");

@ -36,6 +36,7 @@ import de.anomic.kelondro.kelondroRotateIterator;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public class IndexControlURLs_p {
@ -171,7 +172,7 @@ public class IndexControlURLs_p {
return prop;
}
indexURLEntry.Components comp = entry.comp();
indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0);
indexURLEntry le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0);
if (comp.url() == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);

@ -83,7 +83,7 @@ public final class search {
String profile = post.get("profile", ""); // remote profile hand-over
if (profile.length() > 0) profile = crypt.simpleDecode(profile, null);
//final boolean includesnippet = post.get("includesnippet", "false").equals("true");
final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
final kelondroBitfield constraint = ((post.containsKey("constraint")) && (post.get("constraint", "").length() > 0)) ? new kelondroBitfield(4, post.get("constraint", "______")) : null;
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -133,7 +133,7 @@ public final class search {
long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint, false);
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");

@ -127,7 +127,7 @@ public class yacysearch {
prop.put("input_urlmaskfilter", ".*");
prop.put("input_prefermaskfilter", "");
prop.put("input_indexof", "off");
prop.put("input_constraint", plasmaSearchQuery.catchall_constraint.exportB64());
prop.put("input_constraint", "");
prop.put("input_cat", "href");
prop.put("input_depth", "0");
prop.put("input_contentdom", "text");
@ -167,7 +167,7 @@ public class yacysearch {
String prefermask = post.get("prefermaskfilter", "");
if ((prefermask.length() > 0) && (prefermask.indexOf(".*") < 0)) prefermask = ".*" + prefermask + ".*";
kelondroBitfield constraint = post.containsKey("constraint") ? new kelondroBitfield(4, post.get("constraint", "______")) : plasmaSearchQuery.catchall_constraint;
kelondroBitfield constraint = ((post.containsKey("constraint")) && (post.get("constraint", "").length() > 0)) ? new kelondroBitfield(4, post.get("constraint", "______")) : null;
if (indexof) {
constraint = new kelondroBitfield(4);
constraint.set(plasmaCondenser.flag_cat_indexof, true);
@ -401,7 +401,7 @@ public class yacysearch {
prop.putHTML("input_urlmaskfilter", urlmask);
prop.putHTML("input_prefermaskfilter", prefermask);
prop.put("input_indexof", (indexof) ? "on" : "off");
prop.put("input_constraint", constraint.exportB64());
prop.put("input_constraint", (constraint == null) ? "" : constraint.exportB64());
prop.put("input_contentdom", post.get("contentdom", "text"));
prop.put("input_contentdomCheckText", (contentdomCode == plasmaSearchQuery.CONTENTDOM_TEXT) ? "1" : "0");
prop.put("input_contentdomCheckAudio", (contentdomCode == plasmaSearchQuery.CONTENTDOM_AUDIO) ? "1" : "0");
@ -418,6 +418,17 @@ public class yacysearch {
}
private static String navurla(int page, int display, plasmaSearchQuery theQuery) {
return "<a href=\"yacysearch.html?display=" + display + "&amp;search=" + theQuery.queryString() + "&amp;count="+ theQuery.displayResults() + "&amp;offset=" + (page * theQuery.displayResults()) + "&amp;resource=" + theQuery.searchdom() + "&amp;time=" + (theQuery.maximumTime / 1000) + "&amp;urlmaskfilter=" + theQuery.urlMask + "&amp;prefermaskfilter=" + theQuery.prefer + "&amp;cat=href&amp;constraint=" + theQuery.constraint.exportB64() + "&amp;contentdom=" + theQuery.contentdom() + "&amp;former=" + theQuery.queryString() + "\">";
return
"<a href=\"yacysearch.html?display=" + display +
"&amp;search=" + theQuery.queryString() +
"&amp;count="+ theQuery.displayResults() +
"&amp;offset=" + (page * theQuery.displayResults()) +
"&amp;resource=" + theQuery.searchdom() +
"&amp;time=" + (theQuery.maximumTime / 1000) +
"&amp;urlmaskfilter=" + theQuery.urlMask +
"&amp;prefermaskfilter=" + theQuery.prefer +
"&amp;cat=href&amp;constraint=" + ((theQuery.constraint == null) ? "" : theQuery.constraint.exportB64()) +
"&amp;contentdom=" + theQuery.contentdom() +
"&amp;former=" + theQuery.queryString() + "\">";
}
}
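
Across this commit the former plasmaSearchQuery.catchall_constraint sentinel is replaced by a
nullable kelondroBitfield: null now means "no constraint", and it round-trips through forms and
peer requests as the empty string. A condensed sketch of the convention as the yacysearch hunks
above apply it:

    // parse: an absent or empty form field yields no constraint at all
    kelondroBitfield constraint =
        (post.containsKey("constraint") && post.get("constraint", "").length() > 0)
            ? new kelondroBitfield(4, post.get("constraint", "______"))
            : null;
    // export symmetrically: null becomes the empty string
    prop.put("input_constraint", (constraint == null) ? "" : constraint.exportB64());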

@ -220,8 +220,8 @@ public class yacysearchitem {
prop.put("content_rankingprops", result.word().toPropertyForm() + ", domLengthEstimated=" + yacyURL.domLengthEstimation(result.hash()) +
((yacyURL.probablyRootURL(result.hash())) ? ", probablyRootURL" : "") +
(((wordURL = yacyURL.probablyWordURL(result.hash(), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : ""));
prop.put("content_snippet", result.textSnippet().getLineMarked(theQuery.queryHashes));
plasmaSnippetCache.TextSnippet snippet = result.textSnippet();
prop.put("content_snippet", (snippet == null) ? "(snippet not found)" : snippet.getLineMarked(theQuery.queryHashes));
return prop;
}

@ -125,7 +125,7 @@ public final class plasmaSearchEvent {
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
// do a global search
this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation);
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, process, ranking, 2, max_results_preparation);
int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds
if (fetchpeers > 50) fetchpeers = 50;
@ -160,14 +160,15 @@ public final class plasmaSearchEvent {
serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
} else {
// do a local search
process.startTimer();
Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null);
process.yield(COLLECTION, searchContainerMaps[0].size());
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, process, ranking, 2, max_results_preparation);
this.rankedCache.execQuery(true);
this.localcount = this.rankedCache.filteredCount();
//plasmaWordIndex.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);
if (generateAbstracts) {
// compute index abstracts
process.startTimer();
Iterator ci = searchContainerMaps[0].entrySet().iterator();
Iterator ci = this.rankedCache.searchContainerMaps()[0].entrySet().iterator();
Map.Entry entry;
int maxcount = -1;
double mindhtdistance = 1.1, d;
@ -190,22 +191,9 @@ public final class plasmaSearchEvent {
IACount.put(wordhash, new Integer(container.size()));
IAResults.put(wordhash, indexContainer.compressIndex(container, null, 1000).toString());
}
process.yield("abstract generation", searchContainerMaps[0].size());
process.yield("abstract generation", this.rankedCache.searchContainerMaps()[0].size());
}
process.startTimer();
indexContainer rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
indexContainer.joinExcludeContainers(
searchContainerMaps[0].values(),
searchContainerMaps[1].values(),
query.maxDistance);
process.yield(JOIN, rcLocal.size());
this.localcount = rcLocal.size();
this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation);
this.rankedCache.insert(rcLocal, true);
}
if (query.onlineSnippetFetch) {
@ -221,10 +209,8 @@ public final class plasmaSearchEvent {
indexURLEntry uentry;
ResultEntry resultEntry;
synchronized (rankedCache) {
Iterator urlIterator = rankedCache.entries(wordIndex, true);
while ((urlIterator.hasNext()) && (resultList.size() < (query.neededResults()))) {
// fetch next entry
uentry = (indexURLEntry) urlIterator.next();
while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (resultList.size() < (query.neededResults()))) {
System.out.println("***DEBUG*** SEARCH RESULT URL=" + uentry.comp().url().toNormalform(false, false));
resultEntry = obtainResultEntry(uentry, (snippetComputationAllTime < 300) ? 1 : 0);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
@ -260,51 +246,12 @@ public final class plasmaSearchEvent {
public void run() {
// do a local search
process.startTimer();
Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null);
process.yield(COLLECTION, searchContainerMaps[0].size());
// use the search containers to fill up rcAbstracts locally
/*
if ((rcAbstracts != null) && (searchContainerMap != null)) {
Iterator i, ci = searchContainerMap.entrySet().iterator();
Map.Entry entry;
String wordhash;
indexContainer container;
TreeMap singleAbstract;
String mypeerhash = yacyCore.seedDB.mySeed.hash;
while (ci.hasNext()) {
entry = (Map.Entry) ci.next();
wordhash = (String) entry.getKey();
container = (indexContainer) entry.getValue();
// collect all urlhashes from the container
synchronized (rcAbstracts) {
singleAbstract = (TreeMap) rcAbstracts.get(wordhash); // a mapping from url-hashes to a string of peer-hashes
if (singleAbstract == null) singleAbstract = new TreeMap();
i = container.entries();
while (i.hasNext()) singleAbstract.put(((indexEntry) i.next()).urlHash(), mypeerhash);
rcAbstracts.put(wordhash, singleAbstract);
}
}
}
*/
// join and exclude the local result
process.startTimer();
indexContainer rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
indexContainer.joinExcludeContainers(
searchContainerMaps[0].values(),
searchContainerMaps[1].values(),
query.maxDistance);
process.yield(JOIN, rcLocal.size());
localcount = rcLocal.size();
// sort the local containers and truncate it to a limited count,
// so following sortings together with the global results will be fast
synchronized (rankedCache) {
rankedCache.insert(rcLocal, true);
rankedCache.execQuery(true);
localcount = rankedCache.filteredCount();
}
}
}
@ -367,7 +314,7 @@ public final class plasmaSearchEvent {
}
// check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
if ((query.constraint != null) &&
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(comp.title().startsWith("Index of")))) {
final Iterator wi = query.queryHashes.iterator();
@ -401,7 +348,7 @@ public final class plasmaSearchEvent {
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, (snippetFetchMode == 2), query.constraint.get(plasmaCondenser.flag_cat_indexof), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
long snippetComputationTime = System.currentTimeMillis() - startTime;
serverLog.logInfo("SEARCH_EVENT", "text snippet load time for " + comp.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
@ -550,7 +497,6 @@ public final class plasmaSearchEvent {
private class resultWorker extends Thread {
private indexRWIEntry entry; // entry this thread is working on
private long timeout; // the date until this thread should try to work
private long sleeptime; // the sleeptime of this thread at the beginning of its life
private int id;
@ -559,7 +505,6 @@ public final class plasmaSearchEvent {
this.id = id;
this.timeout = System.currentTimeMillis() + lifetime;
this.sleeptime = lifetime / 10 * id;
this.entry = null;
}
public void run() {
@ -568,35 +513,16 @@ public final class plasmaSearchEvent {
if (anyRemoteSearchAlive()) try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {}
// start fetching urls and snippets
while (true) {
if (resultList.size() > query.neededResults() + query.displayResults()) break; // computed enough
if (System.currentTimeMillis() > this.timeout) break; // time is over
indexURLEntry page;
while ((resultList.size() < query.neededResults() + query.displayResults()) &&
(System.currentTimeMillis() < this.timeout) &&
((page = rankedCache.bestURL(true)) != null)) {
if (anyResultWith(page.hash())) continue;
if (anyFailureWith(page.hash())) continue;
// try secondary search
prepareSecondarySearch(); // will be executed only once
// fetch next entry to work on
this.entry = null;
entry = nextOrder();
if (entry == null) {
if (anyRemoteSearchAlive()) {
// wait and try again
try {Thread.sleep(100);} catch (InterruptedException e) {}
continue;
} else {
// we will not see that there come more results in
break;
}
}
indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry, 0);
if (page == null) {
registerFailure(entry.urlHash(), "url does not exist in lurl-db");
continue;
}
ResultEntry resultEntry = obtainResultEntry(page, 2);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
@ -617,29 +543,6 @@ public final class plasmaSearchEvent {
serverLog.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
}
private indexRWIEntry nextOrder() {
synchronized (rankedCache) {
Iterator i = rankedCache.entries(null, false);
indexRWIEntry entry;
String urlhash;
while (i.hasNext()) {
entry = (indexRWIEntry) i.next();
urlhash = entry.urlHash();
if ((anyFailureWith(urlhash)) || (anyWorkerWith(urlhash)) || (anyResultWith(urlhash))) continue;
return entry;
}
}
return null; // no more entries available
}
private boolean anyWorkerWith(String urlhash) {
for (int i = 0; i < workerThreadCount; i++) {
if ((workerThreads[i] == null) || (workerThreads[i] == this)) continue;
if ((workerThreads[i].entry != null) && (workerThreads[i].entry.urlHash().equals(urlhash))) return true;
}
return false;
}
private boolean anyResultWith(String urlhash) {
for (int i = 0; i < resultList.size(); i++) {
if (((ResultEntry) resultList.get(i)).urlentry.hash().equals(urlhash)) return true;
@ -681,7 +584,7 @@ public final class plasmaSearchEvent {
// fetch the best entry from the resultList, not the entry from item position
// whenever a specific entry was switched in its position and was returned here
// a moving pointer is set to assign that item position as not changeable
int bestpick = postRankingFavourite(item);
int bestpick = item; //postRankingFavourite(item);
if (bestpick != item) {
// switch the elements
ResultEntry buf = (ResultEntry) this.resultList.get(bestpick);
@ -695,68 +598,6 @@ public final class plasmaSearchEvent {
}
}
private int postRankingFavourite(int item) {
// do a post-ranking on resultList, which should be locked upon time of this call
long rank, bestrank = 0;
int bestitem = item;
ResultEntry entry;
for (int i = item; i < this.resultList.size(); i++) {
entry = (ResultEntry) this.resultList.get(i);
rank = this.ranking.postRanking(this.query, this.references(10), entry, item);
if (rank > bestrank) {
bestrank = rank;
bestitem = i;
}
}
return bestitem;
}
/*
public void removeRedundant() {
// remove all urls from the pageAcc structure that occur double by specific redundancy rules
// a link is redundant, if a sub-path of the url is cited before. redundant urls are removed
// we find redundant urls by iteration over all elements in pageAcc
Iterator i = pageAcc.entrySet().iterator();
HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
Map.Entry entry;
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
paths.put(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true), entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
}
// now scan the pageAcc again and remove all redundant urls
i = pageAcc.entrySet().iterator();
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
shorten = shortenPath(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true));
// scan all subpaths of the url
while (shorten != null) {
if (pageAcc.size() <= query.wantedResults) break;
if (paths.containsKey(shorten)) {
//System.out.println("deleting path from search result: " + path + " is redundant to " + shorten);
try {
i.remove();
} catch (IllegalStateException e) {
}
}
shorten = shortenPath(shorten);
}
}
}
private static String shortenPath(String path) {
int pos = path.lastIndexOf('/');
if (pos < 0) return null;
return path.substring(0, pos);
}
*/
public ArrayList completeResults(long waitingtime) {
long timeout = System.currentTimeMillis() + waitingtime;
while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
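
With nextOrder() and anyWorkerWith() removed, each result worker now pulls directly from the
shared ranking cache; bestURL(true) already performs the URL load, the handover double-check and
the double-domain skipping. A condensed sketch of the new worker loop, assuming the fields
visible in the hunks above:

    // sketch of the pull-based worker loop introduced by this commit
    indexURLEntry page;
    while ((resultList.size() < query.neededResults() + query.displayResults()) &&
           (System.currentTimeMillis() < this.timeout) &&
           ((page = rankedCache.bestURL(true)) != null)) {
        if (anyResultWith(page.hash())) continue;   // already in the result list
        if (anyFailureWith(page.hash())) continue;  // known to be unusable
        ResultEntry resultEntry = obtainResultEntry(page, 2);
        if (resultEntry == null) continue;          // the entry had problems, cannot be used
        // ... accumulate resultEntry into resultList
    }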

@ -29,7 +29,6 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@ -40,34 +39,45 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOrder;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacyURL;
public final class plasmaSearchRankingProcess {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true;
private TreeMap pageAcc; // key = ranking (Long); value = indexRWIEntry
private TreeMap sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
private HashMap doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
private HashMap handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
private int sortorder;
private int filteredCount;
private indexRWIEntryOrder order;
private serverProfiling process;
private int maxentries;
private int globalcount;
private indexRWIEntryOrder order;
private serverProfiling process;
private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private int[] c; // flag counter
private int[] flagcount; // flag counter
private TreeSet misses; // contains url-hashes that could not been found in the LURL-DB
private plasmaWordIndex wordIndex;
private Map[] localSearchContainerMaps;
public plasmaSearchRankingProcess(plasmaSearchQuery query, serverProfiling process, plasmaSearchRankingProfile ranking, int maxentries) {
public plasmaSearchRankingProcess(plasmaWordIndex wordIndex, plasmaSearchQuery query, serverProfiling process, plasmaSearchRankingProfile ranking, int sortorder, int maxentries) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
this.pageAcc = new TreeMap();
// sortorder: 0 = hash, 1 = url, 2 = ranking
this.localSearchContainerMaps = null;
this.sortedRWIEntries = new TreeMap();
this.doubleDomCache = new HashMap();
this.handover = new HashMap();
this.filteredCount = 0;
this.process = process;
this.order = null;
this.query = query;
@ -76,8 +86,80 @@ public final class plasmaSearchRankingProcess {
this.globalcount = 0;
this.urlhashes = new HashMap();
this.ref = new kelondroMScoreCluster();
c = new int[32];
for (int i = 0; i < 32; i++) {c[i] = 0;}
this.misses = new TreeSet();
this.wordIndex = wordIndex;
this.sortorder = sortorder;
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
}
public void execQuery(boolean fetchURLs) {
if (process != null) process.startTimer();
this.localSearchContainerMaps = wordIndex.localSearchContainers(query, null);
if (process != null) process.yield(plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size());
// join and exclude the local result
if (process != null) process.startTimer();
indexContainer index =
(this.localSearchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
indexContainer.joinExcludeContainers(
this.localSearchContainerMaps[0].values(),
this.localSearchContainerMaps[1].values(),
query.maxDistance);
if (process != null) process.yield(plasmaSearchEvent.JOIN, index.size());
int joincount = index.size();
if (joincount == 0) { // index is never null here: the join falls back to an empty container
return;
}
if (sortorder == 2) {
insert(index, true);
} else {
final Iterator en = index.entries();
// generate a new map where the entries are sorted by url hash (sortorder 0) or by the url text (sortorder 1)
indexRWIEntry ientry;
indexURLEntry uentry;
String u;
loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next();
// check constraints
if (!testFlags(ientry)) continue loop;
// increase flag counts
for (int i = 0; i < 32; i++) {
if (ientry.flags().get(i)) {flagcount[i]++;}
}
// load url
if (sortorder == 0) {
this.sortedRWIEntries.put(ientry.urlHash(), ientry);
this.urlhashes.put(ientry.urlHash(), ientry.urlHash());
filteredCount++;
} else {
if (fetchURLs) {
uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) {
this.misses.add(ientry.urlHash());
} else {
u = uentry.comp().url().toNormalform(false, true);
this.sortedRWIEntries.put(u, ientry);
this.urlhashes.put(ientry.urlHash(), u);
filteredCount++;
}
} else {
filteredCount++;
}
}
// interrupt if we have enough
if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop;
} // end loop
}
}
public void insert(indexContainer container, boolean local) {
@ -102,7 +184,6 @@ public final class plasmaSearchRankingProcess {
// normalize entries and get ranking
if (process != null) process.startTimer();
Iterator i = container.entries();
this.pageAcc = new TreeMap();
indexRWIEntry iEntry, l;
long biggestEntry = 0;
//long s0 = System.currentTimeMillis();
@ -113,89 +194,164 @@ public final class plasmaSearchRankingProcess {
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {c[j]++;}
if (iEntry.flags().get(j)) {flagcount[j]++;}
}
// kick out entries that are too bad according to current findings
r = new Long(order.cardinal(iEntry));
if ((maxentries >= 0) && (pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
if ((maxentries >= 0) && (sortedRWIEntries.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
// check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint
if (!testFlags(iEntry)) continue;
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
}
if ((maxentries < 0) || (pageAcc.size() < maxentries)) {
if ((maxentries < 0) || (sortedRWIEntries.size() < maxentries)) {
if (urlhashes.containsKey(iEntry.urlHash())) continue;
while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1);
pageAcc.put(r, iEntry);
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
sortedRWIEntries.put(r, iEntry);
} else {
if (r.longValue() > biggestEntry) {
continue;
} else {
if (urlhashes.containsKey(iEntry.urlHash())) continue;
l = (indexRWIEntry) pageAcc.remove((Long) pageAcc.lastKey());
l = (indexRWIEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
urlhashes.remove(l.urlHash());
while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1);
pageAcc.put(r, iEntry);
biggestEntry = order.cardinal((indexRWIEntry) pageAcc.get(pageAcc.lastKey()));
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
sortedRWIEntries.put(r, iEntry);
biggestEntry = order.cardinal((indexRWIEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey()));
}
}
urlhashes.put(iEntry.urlHash(), r);
// increase counter for statistics
if (!local) this.globalcount++;
}
this.filteredCount = pageAcc.size();
this.filteredCount = sortedRWIEntries.size();
//long sc = Math.max(1, System.currentTimeMillis() - s0);
//System.out.println("###DEBUG### time to sort " + container.size() + " entries to " + this.filteredCount + ": " + sc + " milliseconds, " + (container.size() / sc) + " entries/millisecond, ranking = " + tc);
if (container.size() > query.neededResults()) remove(true, true);
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
if (process != null) process.yield(plasmaSearchEvent.PRESORT, container.size());
}
public class rIterator implements Iterator {
boolean urls;
Iterator r;
plasmaWordIndex wi;
public rIterator(plasmaWordIndex wi, boolean fetchURLs) {
// if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects
this.urls = fetchURLs;
this.r = pageAcc.entrySet().iterator();
this.wi = wi;
}
public boolean hasNext() {
return r.hasNext();
}
public Object next() {
Map.Entry entry = (Map.Entry) r.next();
indexRWIEntry ientry = (indexRWIEntry) entry.getValue();
if (urls) {
return wi.loadedURL.load(ientry.urlHash(), ientry, ((Long) entry.getKey()).longValue());
} else {
return ientry;
}
}
public void remove() {
throw new UnsupportedOperationException();
}
private boolean testFlags(indexRWIEntry ientry) {
if (query.constraint == null) return true;
// test if ientry matches with filter
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if (query.allofconstraint) {
for (int i = 0; i < 32; i++) {
if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
}
return true;
}
for (int i = 0; i < 32; i++) {
if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
}
return false;
}
public synchronized Map[] searchContainerMaps() {
// direct access to the result maps is needed for abstract generation
// this is only available if execQuery() was called before
return localSearchContainerMaps;
}
public int size() {
assert pageAcc.size() == urlhashes.size();
return pageAcc.size();
// todo:
// - remove redundant urls (sub-path occurred before)
// - move up shorter urls
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) {
// returns the best entry from the current RWI list and removes it from the list
Object bestEntry;
TreeMap m;
indexRWIEntry rwi;
while (sortedRWIEntries.size() > 0) {
bestEntry = sortedRWIEntries.firstKey();
rwi = (indexRWIEntry) sortedRWIEntries.remove(bestEntry);
if (!skipDoubleDom) return new Object[]{bestEntry, rwi};
// check doubledom
String domhash = rwi.urlHash().substring(6);
m = (TreeMap) this.doubleDomCache.get(domhash);
if (m == null) {
// first appearance of dom
m = new TreeMap();
this.doubleDomCache.put(domhash, m);
return new Object[]{bestEntry, rwi};
}
// second or later appearance of this dom: park the entry
m.put(bestEntry, rwi);
}
// no more entries in sorted RWI entries. Now take elements from the doubleDomCache
// find best entry from all caches
Iterator i = this.doubleDomCache.values().iterator();
bestEntry = null;
Object o;
indexRWIEntry bestrwi = null;
while (i.hasNext()) {
m = (TreeMap) i.next();
if (m.size() == 0) continue;
if (bestEntry == null) {
bestEntry = m.firstKey();
bestrwi = (indexRWIEntry) m.remove(bestEntry);
continue;
}
o = m.firstKey();
rwi = (indexRWIEntry) m.remove(o);
if (o instanceof Long) {
if (((Long) o).longValue() < ((Long) bestEntry).longValue()) {
bestEntry = o;
bestrwi = rwi;
}
}
if (o instanceof String) {
if (((String) o).compareTo((String) bestEntry) < 0) {
bestEntry = o;
bestrwi = rwi;
}
}
}
if (bestrwi == null) return null;
// finally remove the best entry from the doubledom cache
m = (TreeMap) this.doubleDomCache.get(bestrwi.urlHash().substring(6));
m.remove(bestEntry);
return new Object[]{bestEntry, bestrwi};
}
public synchronized indexURLEntry bestURL(boolean skipDoubleDom) {
// returns the best URL entry from the current RWI list and removes it from the list
while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
Object[] obrwi = bestRWI(skipDoubleDom);
if (obrwi == null) return null; // defensive: no retrievable entry is left
Object bestEntry = obrwi[0];
indexRWIEntry ientry = (indexRWIEntry) obrwi[1];
long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0;
indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking);
if (u != null) {
this.handover.put(u.hash(), u.comp().url().toNormalform(true, false)); // remember that we handed over this url
return u;
}
misses.add(ientry.urlHash());
}
return null;
}
public synchronized int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = sortedRWIEntries.size();
Iterator i = this.doubleDomCache.values().iterator();
while (i.hasNext()) c += ((TreeMap) i.next()).size();
return c;
}
public int[] flagCount() {
return c;
return flagcount;
}
public int filteredCount() {
@ -207,17 +363,16 @@ public final class plasmaSearchRankingProcess {
}
public indexRWIEntry remove(String urlHash) {
Long r = (Long) urlhashes.get(urlHash);
Object r = urlhashes.get(urlHash); // a Long ranking key or a String url, depending on sortorder
if (r == null) return null;
assert pageAcc.containsKey(r);
indexRWIEntry iEntry = (indexRWIEntry) pageAcc.remove(r);
assert sortedRWIEntries.containsKey(r);
indexRWIEntry iEntry = (indexRWIEntry) sortedRWIEntries.remove(r);
urlhashes.remove(urlHash);
return iEntry;
}
public Iterator entries(plasmaWordIndex wi, boolean fetchURLs) {
// if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects
return new rIterator(wi, fetchURLs);
public Iterator miss() {
return this.misses.iterator();
}
public Set getReferences(int count) {
@ -257,35 +412,6 @@ public final class plasmaSearchRankingProcess {
return this.order;
}
private void remove(boolean rootDomExt, boolean doubleDom) {
// this removes all references to urls that are extended paths of existing 'RootDom'-urls
if (pageAcc.size() <= query.neededResults()) return;
HashSet rootDoms = new HashSet();
HashSet doubleDoms = new HashSet();
Iterator i = pageAcc.entrySet().iterator();
Map.Entry entry;
indexRWIEntry iEntry;
String hashpart;
boolean isWordRootURL;
TreeSet querywords = plasmaSearchQuery.cleanQuery(query.queryString())[0];
while (i.hasNext()) {
if (pageAcc.size() <= query.neededResults()) break;
entry = (Map.Entry) i.next();
iEntry = (indexRWIEntry) entry.getValue();
hashpart = iEntry.urlHash().substring(6);
isWordRootURL = yacyURL.isWordRootURL(iEntry.urlHash(), querywords);
if (isWordRootURL) {
rootDoms.add(hashpart);
} else {
if (((rootDomExt) && (rootDoms.contains(hashpart))) ||
((doubleDom) && (doubleDoms.contains(hashpart)))) {
i.remove();
}
}
doubleDoms.add(hashpart);
}
}
public static void loadYBR(File rankingPath, int count) {
// load ranking tables
if (rankingPath.exists()) {
@ -337,4 +463,45 @@ public final class plasmaSearchRankingProcess {
return 15;
}
public long postRanking(
Set topwords,
plasmaSearchEvent.ResultEntry rentry,
int position) {
long r = (255 - position) << 8;
// for media search: prefer pages with many links
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << ranking.coeff_cathasimage;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << ranking.coeff_cathasaudio;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << ranking.coeff_cathasvideo;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) r += rentry.lapp() << ranking.coeff_cathasapp;
// prefer hit with 'prefer' pattern
if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << ranking.coeff_prefer;
if (rentry.title().matches(query.prefer)) r += 256 << ranking.coeff_prefer;
// apply 'common-sense' heuristic using references
String urlstring = rentry.url().toNormalform(true, true);
String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << ranking.coeff_urlcompintoplist;
}
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << ranking.coeff_descrcompintoplist;
}
// apply query-in-result matching
Set urlcomph = plasmaCondenser.words2hashSet(urlcomps);
Set descrcomph = plasmaCondenser.words2hashSet(descrcomps);
Iterator shi = query.queryHashes.iterator();
String queryhash;
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) r += 256 << ranking.coeff_appurl;
if (descrcomph.contains(queryhash)) r += 256 << ranking.coeff_appdescr;
}
return r;
}
}
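
The bestRWI()/bestURL() pair above is the core of the redesigned data structure: the best hit
from each domain is served from the globally sorted map, later hits from the same domain are
parked in a per-domain TreeMap (doubleDomCache) and only drained once the main map is empty.
A reduced model of that deferral, with simplified types and illustrative names:

    // simplified model of the double-domain deferral; not the committed code
    import java.util.HashMap;
    import java.util.TreeMap;

    class DoubleDomModel {
        TreeMap<Long, String> sorted = new TreeMap<Long, String>();  // ranking -> urlhash
        HashMap<String, TreeMap<Long, String>> perDom = new HashMap<String, TreeMap<Long, String>>();

        String next() {
            while (!sorted.isEmpty()) {
                Long best = sorted.firstKey();
                String urlhash = sorted.remove(best);
                String domhash = urlhash.substring(6);        // trailing characters address the domain
                TreeMap<Long, String> m = perDom.get(domhash);
                if (m == null) {                              // first hit for this domain: hand it out
                    perDom.put(domhash, new TreeMap<Long, String>());
                    return urlhash;
                }
                m.put(best, urlhash);                         // later hit: park it for now
            }
            // main map drained: return the best parked entry across all domains
            Long bestKey = null;
            TreeMap<Long, String> bestMap = null;
            for (TreeMap<Long, String> m : perDom.values()) {
                if (m.isEmpty()) continue;
                if (bestKey == null || m.firstKey() < bestKey) { bestKey = m.firstKey(); bestMap = m; }
            }
            return (bestMap == null) ? null : bestMap.remove(bestKey);
        }
    }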

@ -44,9 +44,6 @@ package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import de.anomic.htmlFilter.htmlFilterContentScraper;
public class plasmaSearchRankingProfile {
@ -113,7 +110,7 @@ public class plasmaSearchRankingProfile {
coeff_appauthor = 13;
coeff_apptags = 8;
coeff_appref = 9;
coeff_appemph = 11;
coeff_appemph = 13;
coeff_urlcompintoplist = 3;
coeff_descrcompintoplist = 2;
coeff_prefer = 15;
@ -248,47 +245,4 @@ public class plasmaSearchRankingProfile {
return new String(ext);
}
public long postRanking(
plasmaSearchQuery query,
Set topwords,
plasmaSearchEvent.ResultEntry rentry,
int position) {
long ranking = (255 - position) << 8;
// for media search: prefer pages with many links
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ranking += rentry.limage() << coeff_cathasimage;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ranking += rentry.laudio() << coeff_cathasaudio;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ranking += rentry.lvideo() << coeff_cathasvideo;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) ranking += rentry.lapp() << coeff_cathasapp;
// prefer hit with 'prefer' pattern
if (rentry.url().toNormalform(true, true).matches(query.prefer)) ranking += 256 << coeff_prefer;
if (rentry.title().matches(query.prefer)) ranking += 256 << coeff_prefer;
// apply 'common-sense' heuristic using references
String urlstring = rentry.url().toNormalform(true, true);
String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) ranking += Math.max(1, 256 - urlstring.length()) << coeff_urlcompintoplist;
}
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j])) ranking += Math.max(1, 256 - rentry.title().length()) << coeff_descrcompintoplist;
}
// apply query-in-result matching
Set urlcomph = plasmaCondenser.words2hashSet(urlcomps);
Set descrcomph = plasmaCondenser.words2hashSet(descrcomps);
Iterator shi = query.queryHashes.iterator();
String queryhash;
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) ranking += 256 << coeff_appurl;
if (descrcomph.contains(queryhash)) ranking += 256 << coeff_appdescr;
}
return ranking;
}
}
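
postRanking() moved from plasmaSearchRankingProfile into plasmaSearchRankingProcess, and
coeff_appemph was raised from 11 to 13. The coefficients act as bit-shift weights, so each
coefficient step doubles a feature's contribution. A small worked example using values visible
in this file:

    // worked example of the shift-based weighting (coefficients from this hunk)
    int position = 0;                    // first entry of the result list
    long r = (255 - position) << 8;      // base rank: 255 * 256 = 65280
    boolean preferMatch = true;          // assume the url matched the 'prefer' pattern
    if (preferMatch) r += 256 << 15;     // coeff_prefer = 15: adds 256 * 32768 = 8388608
    // raising coeff_appemph from 11 to 13 therefore quadruples the emphasized-word bonus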

@ -34,7 +34,6 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -47,7 +46,6 @@ import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroOrder;
@ -65,7 +63,7 @@ public final class plasmaWordIndex implements indexRI {
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 400; // maximum number of references for each urlhash
public static final int lowcachedivisor = 320;
public static final int maxCollectionPartition = 8; // should be 7
public static final int maxCollectionPartition = 7; // should be 7
private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;
private final indexRAMRI dhtOutCache, dhtInCache;
@ -407,115 +405,6 @@ public final class plasmaWordIndex implements indexRI {
return new Map[]{inclusionContainers, exclusionContainers};
}
public Finding retrieveURLs(plasmaSearchQuery query, boolean loadurl, int sortorder, plasmaSearchRankingProfile ranking) {
// search for a word hash and generate a list of url links
// sortorder: 0 = hash, 1 = url, 2 = ranking
assert query.queryHashes.size() == 1;
final TreeSet mi = new TreeSet();
String keyhash = (String) query.queryHashes.first();
kelondroBitfield filter = query.constraint;
indexContainer index = getContainer(keyhash, null);
indexRWIEntry ientry;
indexURLEntry uentry;
final int[] c = new int[32];
for (int i = 0; i < 32; i++) {c[i] = 0;}
if ((index == null) || (index.size() == 0)) {
return new Finding(mi.iterator(), mi.iterator(), mi, 0, c);
}
if (sortorder == 2) {
plasmaSearchRankingProcess process = new plasmaSearchRankingProcess(query, null, ranking, query.neededResults());
process.insert(index, true);
return new Finding(process.entries(this, true), null, mi, process.filteredCount(), process.flagCount());
} else {
final TreeMap tm = new TreeMap();
final ArrayList indexes = new ArrayList();
final Iterator en = index.entries();
// generate a new map where the urls are sorted (not by hash but by the url text)
loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next();
// test if ientry matches with filter
if (filter != null) {
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if (query.allofconstraint) {
for (int i = 0; i < 32; i++) {
if ((filter.get(i)) && (!ientry.flags().get(i))) continue loop;
}
} else {
boolean nok = true;
flagtest: for (int i = 0; i < 32; i++) {
if ((filter.get(i)) && (ientry.flags().get(i))) {nok = false; break flagtest;}
}
if (nok) continue loop;
}
}
// increase flag counts
for (int i = 0; i < 32; i++) {
if (ientry.flags().get(i)) {c[i]++;}
}
// load url
if (loadurl) {
uentry = loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) {
mi.add(ientry.urlHash());
} else {
if (sortorder == 0) {
tm.put(uentry.comp().url().toNormalform(false, true), uentry);
}
if (sortorder == 1) {
tm.put(ientry.urlHash(), uentry);
}
}
} else {
indexes.add(ientry);
}
if ((query.neededResults() > 0) && (mi.size() + tm.size() > query.neededResults())) break loop;
} // end loop
if (loadurl) {
return new Finding(tm.values().iterator(), null, mi, tm.size(), c);
} else {
return new Finding(null, indexes.iterator(), mi, indexes.size(), c);
}
}
}
public static class Finding {
private Iterator urls; // an iterator of indexURLEntry objects
private Iterator rwientries; // an iterator of indexRWIEntry objects
private Set misses; // a set of hashes where we did not found items
private int findcount;
private int[] flagcount;
public Finding(Iterator urls, Iterator rwientries, Set misses, int findcount, int[] flagcount) {
this.findcount = findcount;
this.urls = urls;
this.rwientries = rwientries;
this.misses = misses;
this.flagcount = flagcount;
}
public int size() {
return this.findcount;
}
public Iterator urls() {
return this.urls;
}
public Iterator rwientries() {
return this.rwientries;
}
public Set miss() {
return this.misses;
}
public int[] flagcount() {
return this.flagcount;
}
}
public int size() {
return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size()));
}
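
The Finding inner class and retrieveURLs() are gone; their roles map onto the ranking process
as follows (editor's summary, method names taken from the hunks above):

    // rough mapping from the removed Finding API to plasmaSearchRankingProcess
    // finding.size()        -> ranked.filteredCount()
    // finding.urls()        -> drain with ranked.bestURL(false) while ranked.size() > 0
    // finding.rwientries()  -> no direct equivalent; execQuery(false) only counts matches
    // finding.miss()        -> ranked.miss(), now an Iterator over url-hash strings
    // finding.flagcount()   -> ranked.flagCount()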

@ -380,7 +380,7 @@ public final class yacyClient {
post.put("ttl", "0");
post.put("maxdist", maxDistance);
post.put("profile", crypt.simpleEncode(rankingProfile.toExternalString()));
post.put("constraint", constraint.exportB64());
post.put("constraint", (constraint == null) ? "" : constraint.exportB64());
if (abstractCache != null) post.put("abstracts", "auto");
final long timestamp = System.currentTimeMillis();
