diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html
index cef1768de..ba36674b8 100644
--- a/htroot/CrawlProfileEditor_p.html
+++ b/htroot/CrawlProfileEditor_p.html
@@ -42,7 +42,7 @@
         Local Text Indexing
         Local Media Indexing
         Remote Indexing
-
+        Status / Action
     #{crawlProfiles}#
@@ -61,12 +61,14 @@
         #(indexMedia)#no::yes#(/indexMedia)#
         #(remoteIndexing)#no::yes#(/remoteIndexing)#
         #(terminateButton)#::
+            Running
         #(/terminateButton)#
         #(deleteButton)#::
+            Finished
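Background for the two template hunks above: YaCy's servlet templates switch a `#(key)#alt0::alt1#(/key)#` block server-side with `prop.put("key", n)`, which selects the n-th `::`-separated alternative — that is how the new Running/Finished status cells are toggled. A minimal sketch of the convention, for orientation only (`resolve` is a hypothetical helper, not the real httpd template engine):

```java
// Illustrative sketch of the #(key)#...::...#(/key)# switch convention.
// Not YaCy's actual template engine; "resolve" is a hypothetical helper.
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TemplateSwitchDemo {

    // replaces each #(key)#alt0::alt1::...#(/key)# with the alternative selected by values.get(key)
    static String resolve(String template, Map<String, Integer> values) {
        Pattern p = Pattern.compile("#\\((\\w+)\\)#(.*?)#\\(/\\1\\)#", Pattern.DOTALL);
        Matcher m = p.matcher(template);
        StringBuffer out = new StringBuffer();
        while (m.find()) {
            String[] alternatives = m.group(2).split("::", -1);
            int idx = values.getOrDefault(m.group(1), 0);
            m.appendReplacement(out, Matcher.quoteReplacement(alternatives[Math.min(idx, alternatives.length - 1)]));
        }
        m.appendTail(out);
        return out.toString();
    }

    public static void main(String[] args) {
        String t = "#(deleteButton)#::Finished#(/deleteButton)#";
        System.out.println(resolve(t, Map.of("deleteButton", 1))); // prints "Finished"
        System.out.println(resolve(t, Map.of("deleteButton", 0))); // prints "" (first, empty alternative)
    }
}
```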
diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java
index 5094c7047..e5ba3cc9b 100644
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@@ -46,7 +46,6 @@ import de.anomic.plasma.plasmaSearchEvent;
 import de.anomic.plasma.plasmaSearchQuery;
 import de.anomic.plasma.plasmaSearchRankingProcess;
 import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaWordIndex;
 import de.anomic.plasma.urlPattern.abstractURLPattern;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverDate;
@@ -92,8 +91,8 @@ public class IndexControlRWIs_p {
         if (post.containsKey("keystringsearch")) {
             keyhash = plasmaCondenser.word2hash(keystring);
             prop.put("keyhash", keyhash);
-            final plasmaWordIndex.Finding finding = genSearchresult(prop, sb, keyhash, null, false, sortorder);
-            if (finding.size() == 0) {
+            final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, null, sortorder, false);
+            if (ranking.filteredCount() == 0) {
                 prop.put("searchresult", 1);
                 prop.put("searchresult_word", keystring);
             }
@@ -103,8 +102,8 @@
             if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) {
                 prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
             }
-            final plasmaWordIndex.Finding finding = genSearchresult(prop, sb, keyhash, null, false, sortorder);
-            if (finding.size() == 0) {
+            final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, null, sortorder, false);
+            if (ranking.filteredCount() == 0) {
                 prop.put("searchresult", 2);
                 prop.put("searchresult_wordhash", keyhash);
             }
@@ -162,8 +161,8 @@ public class IndexControlRWIs_p {
             }
             kelondroBitfield flags = compileFlags(post);
             int count = (post.get("lines", "all").equals("all")) ? -1 : post.getInt("lines", -1);
-            final plasmaWordIndex.Finding finding = genSearchresult(prop, sb, keyhash, flags, true, sortorder);
-            genURLList(prop, keyhash, keystring, finding, flags, count, sortorder);
+            final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, flags, sortorder, true);
+            genURLList(prop, keyhash, keystring, ranking, flags, count, sortorder);
         }

         // transfer to other peer
@@ -319,11 +318,11 @@ public class IndexControlRWIs_p {

     private static kelondroBitfield compileFlags(serverObjects post) {
         kelondroBitfield b = new kelondroBitfield(4);
-        if (post.get("allurl", "").equals("on")) {
-            for (int i = 0; i < 32; i++) {b.set(i, true);}
-            return b;
+        if (post.get("allurl", "").equals("on")) return null;
+        if (post.get("flags") != null) {
+            if (post.get("flags","").length() == 0) return null;
+            return new kelondroBitfield(4, (String) post.get("flags"));
         }
-        if (post.get("flags") != null) return new kelondroBitfield(4, (String) post.get("flags"));
         if (post.get("reference", "").equals("on")) b.set(indexRWIEntry.flag_app_reference, true);
         if (post.get("description", "").equals("on")) b.set(indexRWIEntry.flag_app_descr, true);
         if (post.get("author", "").equals("on")) b.set(indexRWIEntry.flag_app_author, true);
@@ -359,51 +358,52 @@ public class IndexControlRWIs_p {
         }
     }

-    private static plasmaWordIndex.Finding genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, boolean urlfetch, int sortorder) {
-        final plasmaWordIndex.Finding finding = sb.wordIndex.retrieveURLs(new plasmaSearchQuery(keyhash, -1, filter), urlfetch, sortorder, sb.getRanking());
-        if (finding.size() == 0) {
+    private static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) {
+        plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, filter);
+        plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, null, sb.getRanking(), sortorder, Integer.MAX_VALUE);
+        ranked.execQuery(fetchURLs);
+
+        if (ranked.filteredCount() == 0) {
             prop.put("searchresult", 2);
             prop.put("searchresult_wordhash", keyhash);
         } else {
             prop.put("searchresult", 3);
-            prop.put("searchresult_allurl", finding.size());
-            prop.put("searchresult_reference", finding.flagcount()[indexRWIEntry.flag_app_reference]);
-            prop.put("searchresult_description", finding.flagcount()[indexRWIEntry.flag_app_descr]);
-            prop.put("searchresult_author", finding.flagcount()[indexRWIEntry.flag_app_author]);
-            prop.put("searchresult_tag", finding.flagcount()[indexRWIEntry.flag_app_tags]);
-            prop.put("searchresult_url", finding.flagcount()[indexRWIEntry.flag_app_url]);
-            prop.put("searchresult_emphasized", finding.flagcount()[indexRWIEntry.flag_app_emphasized]);
-            prop.put("searchresult_image", finding.flagcount()[plasmaCondenser.flag_cat_hasimage]);
-            prop.put("searchresult_audio", finding.flagcount()[plasmaCondenser.flag_cat_hasaudio]);
-            prop.put("searchresult_video", finding.flagcount()[plasmaCondenser.flag_cat_hasvideo]);
-            prop.put("searchresult_app", finding.flagcount()[plasmaCondenser.flag_cat_hasapp]);
-            prop.put("searchresult_indexof", finding.flagcount()[plasmaCondenser.flag_cat_indexof]);
+            prop.put("searchresult_allurl", ranked.filteredCount());
+            prop.put("searchresult_reference", ranked.flagCount()[indexRWIEntry.flag_app_reference]);
+            prop.put("searchresult_description", ranked.flagCount()[indexRWIEntry.flag_app_descr]);
+            prop.put("searchresult_author", ranked.flagCount()[indexRWIEntry.flag_app_author]);
+            prop.put("searchresult_tag", ranked.flagCount()[indexRWIEntry.flag_app_tags]);
+            prop.put("searchresult_url", ranked.flagCount()[indexRWIEntry.flag_app_url]);
+            prop.put("searchresult_emphasized", ranked.flagCount()[indexRWIEntry.flag_app_emphasized]);
+            prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]);
+            prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]);
+            prop.put("searchresult_video", ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]);
+            prop.put("searchresult_app", ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]);
+            prop.put("searchresult_indexof", ranked.flagCount()[plasmaCondenser.flag_cat_indexof]);
         }
-        return finding;
+        return ranked;
     }

-    private static void genURLList(serverObjects prop, String keyhash, String keystring, plasmaWordIndex.Finding finding, kelondroBitfield flags, int maxlines, int ordering) {
+    private static void genURLList(serverObjects prop, String keyhash, String keystring, plasmaSearchRankingProcess ranked, kelondroBitfield flags, int maxlines, int ordering) {
         // search for a word hash and generate a list of url links
         prop.put("genUrlList_keyHash", keyhash);
-        if (finding.size() == 0) {
+        if (ranked.filteredCount() == 0) {
             prop.put("genUrlList", 1);
             prop.put("genUrlList_count", 0);
             prop.put("searchresult", 2);
         } else {
             prop.put("genUrlList", 2);
             prop.put("searchresult", 3);
-            prop.put("genUrlList_flags", flags.exportB64());
+            prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64());
             prop.put("genUrlList_lines", maxlines);
             prop.put("genUrlList_ordering", ordering);
             int i = 0;
             yacyURL url;
-            Iterator iter = finding.urls();
             indexURLEntry entry;
             String us;
             long rn = -1;
-            while (iter.hasNext()) {
-                entry = (indexURLEntry) iter.next();
+            while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) {
                 if ((entry == null) || (entry.comp() == null)) continue;
                 url = entry.comp().url();
                 if (url == null) continue;
@@ -452,7 +452,7 @@ public class IndexControlRWIs_p {
                 i++;
                 if ((maxlines >= 0) && (i >= maxlines)) break;
             }
-            iter = finding.miss().iterator();
+            Iterator iter = ranked.miss(); // iterates url hash strings
             while (iter.hasNext()) {
                 us = (String) iter.next();
                 prop.put("genUrlList_urlList_"+i+"_urlExists", "0");
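The `compileFlags()` change above establishes the convention the rest of this patch relies on: a `null` bitfield now means "no constraint" (match everything), replacing the old all-bits-set `catchall_constraint`. A self-contained sketch of the matching semantics, with `java.util.BitSet` standing in for `kelondroBitfield`; the all-of/any-of split mirrors the new `testFlags()` in plasmaSearchRankingProcess further down:

```java
import java.util.BitSet;

public class ConstraintDemo {

    // returns true if flags satisfy the constraint; a null constraint matches everything
    static boolean matches(BitSet flags, BitSet constraint, boolean allOf) {
        if (constraint == null) return true;       // no constraint: pass everything
        if (allOf) {                               // every constraint bit must be set in flags
            BitSet missing = (BitSet) constraint.clone();
            missing.andNot(flags);
            return missing.isEmpty();
        }
        return constraint.intersects(flags);       // any-of: one shared bit suffices
    }

    public static void main(String[] args) {
        BitSet flags = new BitSet(32);
        flags.set(3);                              // e.g. the author flag
        System.out.println(matches(flags, null, true));        // true: unconstrained
        BitSet constraint = new BitSet(32);
        constraint.set(3);
        constraint.set(7);
        System.out.println(matches(flags, constraint, true));  // false: bit 7 missing
        System.out.println(matches(flags, constraint, false)); // true: bit 3 shared
    }
}
```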
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java
index 5fdd7013f..5b2fda5b6 100644
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -36,6 +36,7 @@ import de.anomic.kelondro.kelondroRotateIterator;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
+import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.yacyURL;

 public class IndexControlURLs_p {
@@ -171,7 +172,7 @@ public class IndexControlURLs_p {
             return prop;
         }
         indexURLEntry.Components comp = entry.comp();
-        indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0);
+        indexURLEntry le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0);
         if (comp.url() == null) {
             prop.put("genUrlProfile", "1");
             prop.put("genUrlProfile_urlhash", urlhash);
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index e3820db35..d9b91c952 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -83,7 +83,7 @@ public final class search {
         String profile = post.get("profile", ""); // remote profile hand-over
         if (profile.length() > 0) profile = crypt.simpleDecode(profile, null);
         //final boolean includesnippet = post.get("includesnippet", "false").equals("true");
-        final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
+        final kelondroBitfield constraint = ((post.containsKey("constraint")) && (post.get("constraint", "").length() > 0)) ? new kelondroBitfield(4, post.get("constraint", "______")) : null;
         // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
         // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@@ -133,7 +133,7 @@ public final class search {
         long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
         if ((query.length() == 0) && (abstractSet != null)) {
             // this is _not_ a normal search, only a request for index abstracts
-            theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint, false);
+            theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false);
             theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
             yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
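search.java above decodes an empty `constraint` parameter back to `null`, and yacyClient.java at the end of this patch exports a `null` constraint as the empty string, so the convention survives the remote hand-over between peers. A sketch of that round-trip, assuming `exportB64()` behaves like a plain Base64 export (`export`/`parse` are hypothetical stand-ins for `kelondroBitfield.exportB64()` and the `kelondroBitfield(4, String)` constructor):

```java
import java.util.Base64;
import java.util.BitSet;

public class ConstraintHandoverDemo {

    static String export(BitSet constraint) {
        if (constraint == null) return "";                  // null travels as the empty string
        return Base64.getEncoder().encodeToString(constraint.toByteArray());
    }

    static BitSet parse(String b64) {
        if (b64 == null || b64.length() == 0) return null;  // the empty string comes back as null
        return BitSet.valueOf(Base64.getDecoder().decode(b64));
    }

    public static void main(String[] args) {
        System.out.println(parse(export(null)));            // null: "no constraint" survives the round-trip
        BitSet c = new BitSet();
        c.set(10);
        System.out.println(parse(export(c)));               // {10}
    }
}
```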
"on" : "off"); - prop.put("input_constraint", constraint.exportB64()); + prop.put("input_constraint", (constraint == null) ? "" : constraint.exportB64()); prop.put("input_contentdom", post.get("contentdom", "text")); prop.put("input_contentdomCheckText", (contentdomCode == plasmaSearchQuery.CONTENTDOM_TEXT) ? "1" : "0"); prop.put("input_contentdomCheckAudio", (contentdomCode == plasmaSearchQuery.CONTENTDOM_AUDIO) ? "1" : "0"); @@ -418,6 +418,17 @@ public class yacysearch { } private static String navurla(int page, int display, plasmaSearchQuery theQuery) { - return ""; + return + ""; } } diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 9cbd82cc9..9a44ba4ee 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -220,8 +220,8 @@ public class yacysearchitem { prop.put("content_rankingprops", result.word().toPropertyForm() + ", domLengthEstimated=" + yacyURL.domLengthEstimation(result.hash()) + ((yacyURL.probablyRootURL(result.hash())) ? ", probablyRootURL" : "") + (((wordURL = yacyURL.probablyWordURL(result.hash(), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : "")); - - prop.put("content_snippet", result.textSnippet().getLineMarked(theQuery.queryHashes)); + plasmaSnippetCache.TextSnippet snippet = result.textSnippet(); + prop.put("content_snippet", (snippet == null) ? "(snippet not found)" : snippet.getLineMarked(theQuery.queryHashes)); return prop; } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 7de4a6a55..417bf9df3 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -125,7 +125,7 @@ public final class plasmaSearchEvent { if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) || (query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) { // do a global search - this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation); + this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, process, ranking, 2, max_results_preparation); int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds if (fetchpeers > 50) fetchpeers = 50; @@ -160,14 +160,15 @@ public final class plasmaSearchEvent { serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); } else { // do a local search - process.startTimer(); - Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null); - process.yield(COLLECTION, searchContainerMaps[0].size()); + this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, process, ranking, 2, max_results_preparation); + this.rankedCache.execQuery(true); + this.localcount = this.rankedCache.filteredCount(); + //plasmaWordIndex.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process); if (generateAbstracts) { // compute index abstracts process.startTimer(); - Iterator ci = searchContainerMaps[0].entrySet().iterator(); + Iterator ci = this.rankedCache.searchContainerMaps()[0].entrySet().iterator(); Map.Entry entry; int maxcount = -1; double mindhtdistance = 1.1, d; @@ -190,22 +191,9 @@ public final class plasmaSearchEvent { IACount.put(wordhash, new Integer(container.size())); IAResults.put(wordhash, indexContainer.compressIndex(container, null, 1000).toString()); } - process.yield("abstract generation", 
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 7de4a6a55..417bf9df3 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -125,7 +125,7 @@ public final class plasmaSearchEvent {
         if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
             (query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
             // do a global search
-            this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation);
+            this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, process, ranking, 2, max_results_preparation);

             int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds
             if (fetchpeers > 50) fetchpeers = 50;
@@ -160,14 +160,15 @@
             serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
         } else {
             // do a local search
-            process.startTimer();
-            Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null);
-            process.yield(COLLECTION, searchContainerMaps[0].size());
+            this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, process, ranking, 2, max_results_preparation);
+            this.rankedCache.execQuery(true);
+            this.localcount = this.rankedCache.filteredCount();
+            //plasmaWordIndex.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);

             if (generateAbstracts) {
                 // compute index abstracts
                 process.startTimer();
-                Iterator ci = searchContainerMaps[0].entrySet().iterator();
+                Iterator ci = this.rankedCache.searchContainerMaps()[0].entrySet().iterator();
                 Map.Entry entry;
                 int maxcount = -1;
                 double mindhtdistance = 1.1, d;
@@ -190,22 +191,9 @@
                     IACount.put(wordhash, new Integer(container.size()));
                     IAResults.put(wordhash, indexContainer.compressIndex(container, null, 1000).toString());
                 }
-                process.yield("abstract generation", searchContainerMaps[0].size());
+                process.yield("abstract generation", this.rankedCache.searchContainerMaps()[0].size());
             }
-
-            process.startTimer();
-            indexContainer rcLocal =
-                (searchContainerMaps == null) ?
-                plasmaWordIndex.emptyContainer(null, 0) :
-                indexContainer.joinExcludeContainers(
-                    searchContainerMaps[0].values(),
-                    searchContainerMaps[1].values(),
-                    query.maxDistance);
-            process.yield(JOIN, rcLocal.size());
-
-            this.localcount = rcLocal.size();
-            this.rankedCache = new plasmaSearchRankingProcess(query, process, ranking, max_results_preparation);
-            this.rankedCache.insert(rcLocal, true);
         }

         if (query.onlineSnippetFetch) {
@@ -221,10 +209,8 @@
             indexURLEntry uentry;
             ResultEntry resultEntry;
             synchronized (rankedCache) {
-                Iterator urlIterator = rankedCache.entries(wordIndex, true);
-                while ((urlIterator.hasNext()) && (resultList.size() < (query.neededResults()))) {
-                    // fetch next entry
-                    uentry = (indexURLEntry) urlIterator.next();
+                while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (resultList.size() < (query.neededResults()))) {
+                    System.out.println("***DEBUG*** SEARCH RESULT URL=" + uentry.comp().url().toNormalform(false, false));
                     resultEntry = obtainResultEntry(uentry, (snippetComputationAllTime < 300) ? 1 : 0);
                     if (resultEntry == null) continue; // the entry had some problems, cannot be used
@@ -260,51 +246,12 @@
         public void run() {

             // do a local search
-            process.startTimer();
-            Map[] searchContainerMaps = wordIndex.localSearchContainers(query, null);
-            process.yield(COLLECTION, searchContainerMaps[0].size());
-
-            // use the search containers to fill up rcAbstracts locally
-            /*
-            if ((rcAbstracts != null) && (searchContainerMap != null)) {
-                Iterator i, ci = searchContainerMap.entrySet().iterator();
-                Map.Entry entry;
-                String wordhash;
-                indexContainer container;
-                TreeMap singleAbstract;
-                String mypeerhash = yacyCore.seedDB.mySeed.hash;
-                while (ci.hasNext()) {
-                    entry = (Map.Entry) ci.next();
-                    wordhash = (String) entry.getKey();
-                    container = (indexContainer) entry.getValue();
-                    // collect all urlhashes from the container
-                    synchronized (rcAbstracts) {
-                        singleAbstract = (TreeMap) rcAbstracts.get(wordhash); // a mapping from url-hashes to a string of peer-hashes
-                        if (singleAbstract == null) singleAbstract = new TreeMap();
-                        i = container.entries();
-                        while (i.hasNext()) singleAbstract.put(((indexEntry) i.next()).urlHash(), mypeerhash);
-                        rcAbstracts.put(wordhash, singleAbstract);
-                    }
-                }
-            }
-            */
-
-            // join and exlcude the local result
-            process.startTimer();
-            indexContainer rcLocal =
-                (searchContainerMaps == null) ?
-                plasmaWordIndex.emptyContainer(null, 0) :
-                indexContainer.joinExcludeContainers(
-                    searchContainerMaps[0].values(),
-                    searchContainerMaps[1].values(),
-                    query.maxDistance);
-            process.yield(JOIN, rcLocal.size());
-            localcount = rcLocal.size();

             // sort the local containers and truncate it to a limited count,
             // so following sortings together with the global results will be fast
             synchronized (rankedCache) {
-                rankedCache.insert(rcLocal, true);
+                rankedCache.execQuery(true);
+                localcount = rankedCache.filteredCount();
             }
         }
     }
@@ -367,7 +314,7 @@
         }

         // check constraints
-        if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
+        if ((query.constraint != null) &&
             (query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
             (!(comp.title().startsWith("Index of")))) {
             final Iterator wi = query.queryHashes.iterator();
@@ -401,7 +348,7 @@
         if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
             // attach text snippet
             startTime = System.currentTimeMillis();
-            plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, (snippetFetchMode == 2), query.constraint.get(plasmaCondenser.flag_cat_indexof), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
+            plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
             long snippetComputationTime = System.currentTimeMillis() - startTime;
             serverLog.logInfo("SEARCH_EVENT", "text snippet load time for " + comp.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
@@ -550,7 +497,6 @@
     private class resultWorker extends Thread {

-        private indexRWIEntry entry; // entry this thread is working on
         private long timeout; // the date until this thread should try to work
         private long sleeptime; // the sleeptime of this thread at the beginning of its life
         private int id;
@@ -559,7 +505,6 @@
             this.id = id;
             this.timeout = System.currentTimeMillis() + lifetime;
             this.sleeptime = lifetime / 10 * id;
-            this.entry = null;
         }

         public void run() {
@@ -568,35 +513,16 @@
             if (anyRemoteSearchAlive()) try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {}

             // start fetching urls and snippets
-            while (true) {
-
-                if (resultList.size() > query.neededResults() + query.displayResults()) break; // computed enough
-
-                if (System.currentTimeMillis() > this.timeout) break; // time is over
+            indexURLEntry page;
+            while ((resultList.size() < query.neededResults() + query.displayResults()) &&
+                   (System.currentTimeMillis() < this.timeout) &&
+                   ((page = rankedCache.bestURL(true)) != null)) {
+                if (anyResultWith(page.hash())) continue;
+                if (anyFailureWith(page.hash())) continue;

                 // try secondary search
                 prepareSecondarySearch(); // will be executed only once

-                // fetch next entry to work on
-                this.entry = null;
-                entry = nextOrder();
-                if (entry == null) {
-                    if (anyRemoteSearchAlive()) {
-                        // wait and try again
-                        try {Thread.sleep(100);} catch (InterruptedException e) {}
-                        continue;
-                    } else {
-                        // we will not see that there come more results in
-                        break;
-                    }
-                }
-
-                indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry, 0);
-                if (page == null) {
-                    registerFailure(entry.urlHash(), "url does not exist in lurl-db");
-                    continue;
-                }
-
                 ResultEntry resultEntry = obtainResultEntry(page, 2);
                 if (resultEntry == null) continue; // the entry had some problems, cannot be used
                 urlRetrievalAllTime += resultEntry.dbRetrievalTime;
@@ -617,29 +543,6 @@
             serverLog.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
         }

-        private indexRWIEntry nextOrder() {
-            synchronized (rankedCache) {
-                Iterator i = rankedCache.entries(null, false);
-                indexRWIEntry entry;
-                String urlhash;
-                while (i.hasNext()) {
-                    entry = (indexRWIEntry) i.next();
-                    urlhash = entry.urlHash();
-                    if ((anyFailureWith(urlhash)) || (anyWorkerWith(urlhash)) || (anyResultWith(urlhash))) continue;
-                    return entry;
-                }
-            }
-            return null; // no more entries available
-        }
-
-        private boolean anyWorkerWith(String urlhash) {
-            for (int i = 0; i < workerThreadCount; i++) {
-                if ((workerThreads[i] == null) || (workerThreads[i] == this)) continue;
-                if ((workerThreads[i].entry != null) && (workerThreads[i].entry.urlHash().equals(urlhash))) return true;
-            }
-            return false;
-        }
-
         private boolean anyResultWith(String urlhash) {
             for (int i = 0; i < resultList.size(); i++) {
                 if (((ResultEntry) resultList.get(i)).urlentry.hash().equals(urlhash)) return true;
             }
@@ -681,7 +584,7 @@
             // fetch the best entry from the resultList, not the entry from item position
             // whenever a specific entry was switched in its position and was returned here
             // a moving pointer is set to assign that item position as not changeable
-            int bestpick = postRankingFavourite(item);
+            int bestpick = item; //postRankingFavourite(item);
             if (bestpick != item) {
                 // switch the elements
                 ResultEntry buf = (ResultEntry) this.resultList.get(bestpick);
@@ -695,68 +598,6 @@
         }
     }

-    private int postRankingFavourite(int item) {
-        // do a post-ranking on resultList, which should be locked upon time of this call
-        long rank, bestrank = 0;
-        int bestitem = item;
-        ResultEntry entry;
-        for (int i = item; i < this.resultList.size(); i++) {
-            entry = (ResultEntry) this.resultList.get(i);
-            rank = this.ranking.postRanking(this.query, this.references(10), entry, item);
-            if (rank > bestrank) {
-                bestrank = rank;
-                bestitem = i;
-            }
-        }
-        return bestitem;
-    }
-
-    /*
-    public void removeRedundant() {
-        // remove all urls from the pageAcc structure that occur double by specific redundancy rules
-        // a link is redundant, if a sub-path of the url is cited before. redundant urls are removed
-        // we find redundant urls by iteration over all elements in pageAcc
-        Iterator i = pageAcc.entrySet().iterator();
-        HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
-        Map.Entry entry;
-
-        // first scan all entries and find all urls that are referenced
-        while (i.hasNext()) {
-            entry = (Map.Entry) i.next();
-            paths.put(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true), entry.getKey());
-            //if (path != null) path = shortenPath(path);
-            //if (path != null) paths.put(path, entry.getKey());
-        }
-
-        // now scan the pageAcc again and remove all redundant urls
-        i = pageAcc.entrySet().iterator();
-        String shorten;
-        while (i.hasNext()) {
-            entry = (Map.Entry) i.next();
-            shorten = shortenPath(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true));
-            // scan all subpaths of the url
-            while (shorten != null) {
-                if (pageAcc.size() <= query.wantedResults) break;
-                if (paths.containsKey(shorten)) {
-                    //System.out.println("deleting path from search result: " + path + " is redundant to " + shorten);
-                    try {
-                        i.remove();
-                    } catch (IllegalStateException e) {
-
-                    }
-                }
-                shorten = shortenPath(shorten);
-            }
-        }
-    }
-
-    private static String shortenPath(String path) {
-        int pos = path.lastIndexOf('/');
-        if (pos < 0) return null;
-        return path.substring(0, pos);
-    }
-    */
-
     public ArrayList completeResults(long waitingtime) {
         long timeout = System.currentTimeMillis() + waitingtime;
         while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
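The plasmaSearchEvent rewrite above replaces the push model (pre-join all containers, then iterate a finished list) with a pull model: each consumer repeatedly calls `rankedCache.bestURL(true)` until it has enough results, the deadline passes, or the cache runs dry. A condensed sketch of that consumer loop (`RankedQueue` is a hypothetical stand-in for plasmaSearchRankingProcess, and plain URL strings stand in for indexURLEntry objects):

```java
import java.util.ArrayList;
import java.util.List;

public class WorkerLoopDemo {

    interface RankedQueue {
        String bestURL(boolean skipDoubleDom); // returns null when exhausted
    }

    static List<String> fetchResults(RankedQueue ranked, int needed, long lifetimeMillis) {
        List<String> results = new ArrayList<String>();
        long deadline = System.currentTimeMillis() + lifetimeMillis;
        String page;
        // three termination conditions, exactly as in the new resultWorker.run():
        // enough results, deadline reached, or the ranked queue is exhausted
        while ((results.size() < needed)
                && (System.currentTimeMillis() < deadline)
                && ((page = ranked.bestURL(true)) != null)) {
            if (results.contains(page)) continue; // double-check, like anyResultWith()
            results.add(page);
        }
        return results;
    }

    public static void main(String[] args) {
        String[] urls = {"http://a.example/", "http://b.example/", "http://c.example/"};
        int[] pos = {0};
        RankedQueue q = new RankedQueue() {
            public String bestURL(boolean skipDoubleDom) {
                return (pos[0] < urls.length) ? urls[pos[0]++] : null;
            }
        };
        System.out.println(fetchResults(q, 2, 1000L)); // [http://a.example/, http://b.example/]
    }
}
```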
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
index 400c6b882..1c1beceb6 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
@@ -29,7 +29,6 @@ package de.anomic.plasma;
 import java.io.File;
 import java.io.IOException;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
@@ -40,34 +39,45 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.index.indexContainer;
 import de.anomic.index.indexRWIEntry;
 import de.anomic.index.indexRWIEntryOrder;
+import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBinSearch;
 import de.anomic.kelondro.kelondroMScoreCluster;
 import de.anomic.server.serverCodings;
 import de.anomic.server.serverFileUtils;
 import de.anomic.server.serverProfiling;
-import de.anomic.yacy.yacyURL;

 public final class plasmaSearchRankingProcess {

     public static kelondroBinSearch[] ybrTables = null; // block-rank tables
     private static boolean useYBR = true;

-    private TreeMap pageAcc; // key = ranking (Long); value = indexRWIEntry
+    private TreeMap sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
+    private HashMap doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
+    private HashMap handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
     private plasmaSearchQuery query;
     private plasmaSearchRankingProfile ranking;
+    private int sortorder;
     private int filteredCount;
-    private indexRWIEntryOrder order;
-    private serverProfiling process;
     private int maxentries;
     private int globalcount;
+    private indexRWIEntryOrder order;
+    private serverProfiling process;
     private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
     private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
-    private int[] c; // flag counter
+    private int[] flagcount; // flag counter
+    private TreeSet misses; // contains url-hashes that could not be found in the LURL-DB
+    private plasmaWordIndex wordIndex;
+    private Map[] localSearchContainerMaps;

-    public plasmaSearchRankingProcess(plasmaSearchQuery query, serverProfiling process, plasmaSearchRankingProfile ranking, int maxentries) {
+    public plasmaSearchRankingProcess(plasmaWordIndex wordIndex, plasmaSearchQuery query, serverProfiling process, plasmaSearchRankingProfile ranking, int sortorder, int maxentries) {
         // we collect the urlhashes and construct a list with urlEntry objects
         // attention: if minEntries is too high, this method will not terminate within the maxTime
-        this.pageAcc = new TreeMap();
+        // sortorder: 0 = hash, 1 = url, 2 = ranking
+        this.localSearchContainerMaps = null;
+        this.sortedRWIEntries = new TreeMap();
+        this.doubleDomCache = new HashMap();
+        this.handover = new HashMap();
+        this.filteredCount = 0;
         this.process = process;
         this.order = null;
         this.query = query;
@@ -76,8 +86,80 @@
         this.globalcount = 0;
         this.urlhashes = new HashMap();
         this.ref = new kelondroMScoreCluster();
-        c = new int[32];
-        for (int i = 0; i < 32; i++) {c[i] = 0;}
+        this.misses = new TreeSet();
+        this.wordIndex = wordIndex;
+        this.sortorder = sortorder;
+        this.flagcount = new int[32];
+        for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
+    }
+
+    public void execQuery(boolean fetchURLs) {
+
+        if (process != null) process.startTimer();
+        this.localSearchContainerMaps = wordIndex.localSearchContainers(query, null);
+        if (process != null) process.yield(plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size());
+
+        // join and exclude the local result
+        if (process != null) process.startTimer();
+        indexContainer index =
+            (this.localSearchContainerMaps == null) ?
+            plasmaWordIndex.emptyContainer(null, 0) :
+            indexContainer.joinExcludeContainers(
+                this.localSearchContainerMaps[0].values(),
+                this.localSearchContainerMaps[1].values(),
+                query.maxDistance);
+        if (process != null) process.yield(plasmaSearchEvent.JOIN, index.size());
+        int joincount = index.size();
+
+        if ((index == null) || (joincount == 0)) {
+            return;
+        }
+
+        if (sortorder == 2) {
+            insert(index, true);
+        } else {
+            final Iterator en = index.entries();
+            // generate a new map where the urls are sorted (not by hash but by the url text)
+
+            indexRWIEntry ientry;
+            indexURLEntry uentry;
+            String u;
+            loop: while (en.hasNext()) {
+                ientry = (indexRWIEntry) en.next();
+
+                // check constraints
+                if (!testFlags(ientry)) continue loop;
+
+                // increase flag counts
+                for (int i = 0; i < 32; i++) {
+                    if (ientry.flags().get(i)) {flagcount[i]++;}
+                }
+
+                // load url
+                if (sortorder == 0) {
+                    this.sortedRWIEntries.put(ientry.urlHash(), ientry);
+                    this.urlhashes.put(ientry.urlHash(), ientry.urlHash());
+                    filteredCount++;
+                } else {
+                    if (fetchURLs) {
+                        uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
+                        if (uentry == null) {
+                            this.misses.add(ientry.urlHash());
+                        } else {
+                            u = uentry.comp().url().toNormalform(false, true);
+                            this.sortedRWIEntries.put(u, ientry);
+                            this.urlhashes.put(ientry.urlHash(), u);
+                            filteredCount++;
+                        }
+                    } else {
+                        filteredCount++;
+                    }
+                }
+
+                // interrupt if we have enough
+                if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop;
+            } // end loop
+        }
     }

     public void insert(indexContainer container, boolean local) {
@@ -102,7 +184,6 @@
         // normalize entries and get ranking
         if (process != null) process.startTimer();
         Iterator i = container.entries();
-        this.pageAcc = new TreeMap();
         indexRWIEntry iEntry, l;
         long biggestEntry = 0;
         //long s0 = System.currentTimeMillis();
@@ -113,89 +194,164 @@

             // increase flag counts
             for (int j = 0; j < 32; j++) {
-                if (iEntry.flags().get(j)) {c[j]++;}
+                if (iEntry.flags().get(j)) {flagcount[j]++;}
             }

             // kick out entries that are too bad according to current findings
             r = new Long(order.cardinal(iEntry));
-            if ((maxentries >= 0) && (pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
-
+            if ((maxentries >= 0) && (sortedRWIEntries.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
+
             // check constraints
-            if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint
+            if (!testFlags(iEntry)) continue;
+
             if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
                 if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
                 if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
                 if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
                 if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP  ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp  )))) continue;
             }
-            if ((maxentries < 0) || (pageAcc.size() < maxentries)) {
+            if ((maxentries < 0) || (sortedRWIEntries.size() < maxentries)) {
                 if (urlhashes.containsKey(iEntry.urlHash())) continue;
-                while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1);
-                pageAcc.put(r, iEntry);
+                while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
+                sortedRWIEntries.put(r, iEntry);
             } else {
                 if (r.longValue() > biggestEntry) {
                     continue;
                 } else {
                     if (urlhashes.containsKey(iEntry.urlHash())) continue;
-                    l = (indexRWIEntry) pageAcc.remove((Long) pageAcc.lastKey());
+                    l = (indexRWIEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
                     urlhashes.remove(l.urlHash());
-                    while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1);
-                    pageAcc.put(r, iEntry);
-                    biggestEntry = order.cardinal((indexRWIEntry) pageAcc.get(pageAcc.lastKey()));
+                    while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
+                    sortedRWIEntries.put(r, iEntry);
+                    biggestEntry = order.cardinal((indexRWIEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey()));
                 }
             }
-
             urlhashes.put(iEntry.urlHash(), r);

             // increase counter for statistics
             if (!local) this.globalcount++;
         }
-        this.filteredCount = pageAcc.size();
+        this.filteredCount = sortedRWIEntries.size();

         //long sc = Math.max(1, System.currentTimeMillis() - s0);
         //System.out.println("###DEBUG### time to sort " + container.size() + " entries to " + this.filteredCount + ": " + sc + " milliseconds, " + (container.size() / sc) + " entries/millisecond, ranking = " + tc);
-        if (container.size() > query.neededResults()) remove(true, true);
+        //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);

         if (process != null) process.yield(plasmaSearchEvent.PRESORT, container.size());
     }
-
-    public class rIterator implements Iterator {
-
-        boolean urls;
-        Iterator r;
-        plasmaWordIndex wi;
-        public rIterator(plasmaWordIndex wi, boolean fetchURLs) {
-            // if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects
-            this.urls = fetchURLs;
-            this.r = pageAcc.entrySet().iterator();
-            this.wi = wi;
-        }
-
-        public boolean hasNext() {
-            return r.hasNext();
-        }
-
-        public Object next() {
-            Map.Entry entry = (Map.Entry) r.next();
-            indexRWIEntry ientry = (indexRWIEntry) entry.getValue();
-            if (urls) {
-                return wi.loadedURL.load(ientry.urlHash(), ientry, ((Long) entry.getKey()).longValue());
-            } else {
-                return ientry;
-            }
-        }
-        public void remove() {
-            throw new UnsupportedOperationException();
-        }
+    private boolean testFlags(indexRWIEntry ientry) {
+        if (query.constraint == null) return true;
+        // test if ientry matches with filter
+        // if all = true: let only entries pass that have all matching bits
+        // if all = false: let all entries pass that have at least one matching bit
+        if (query.allofconstraint) {
+            for (int i = 0; i < 32; i++) {
+                if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
+            }
+            return true;
+        }
+        for (int i = 0; i < 32; i++) {
+            if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
+        }
+        return false;
     }

-    public int size() {
-        assert pageAcc.size() == urlhashes.size();
-        return pageAcc.size();
+    public synchronized Map[] searchContainerMaps() {
+        // direct access to the result maps is needed for abstract generation
+        // this is only available if execQuery() was called before
+        return localSearchContainerMaps;
+    }
+
+    // todo:
+    // - remove redundant urls (sub-path occurred before)
+    // - move up shorter urls
+    // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name

+    private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) {
+        // returns from the current RWI list the best entry and removes this entry from the list
+        Object bestEntry;
+        TreeMap m;
+        indexRWIEntry rwi;
+        while (sortedRWIEntries.size() > 0) {
+            bestEntry = sortedRWIEntries.firstKey();
+            rwi = (indexRWIEntry) sortedRWIEntries.remove(bestEntry);
+            if (!skipDoubleDom) return new Object[]{bestEntry, rwi};
+            // check doubledom
+            String domhash = rwi.urlHash().substring(6);
+            m = (TreeMap) this.doubleDomCache.get(domhash);
+            if (m == null) {
+                // first appearance of dom
+                m = new TreeMap();
+                this.doubleDomCache.put(domhash, m);
+                return new Object[]{bestEntry, rwi};
+            }
+            // second appearance of dom
+            m.put(bestEntry, rwi);
+        }
+        // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
+        // find best entry from all caches
+        Iterator i = this.doubleDomCache.values().iterator();
+        bestEntry = null;
+        Object o;
+        indexRWIEntry bestrwi = null;
+        while (i.hasNext()) {
+            m = (TreeMap) i.next();
+            if (m.size() == 0) continue;
+            if (bestEntry == null) {
+                bestEntry = m.firstKey();
+                bestrwi = (indexRWIEntry) m.remove(bestEntry);
+                continue;
+            }
+            o = m.firstKey();
+            rwi = (indexRWIEntry) m.remove(o);
+            if (o instanceof Long) {
+                if (((Long) o).longValue() < ((Long) bestEntry).longValue()) {
+                    bestEntry = o;
+                    bestrwi = rwi;
+                }
+            }
+            if (o instanceof String) {
+                if (((String) o).compareTo((String) bestEntry) < 0) {
+                    bestEntry = o;
+                    bestrwi = rwi;
+                }
+            }
+        }
+        if (bestrwi == null) return null;
+        // finally remove the best entry from the doubledom cache
+        m = (TreeMap) this.doubleDomCache.get(bestrwi.urlHash().substring(6));
+        m.remove(bestEntry);
+        return new Object[]{bestEntry, bestrwi};
+    }
+
+    public synchronized indexURLEntry bestURL(boolean skipDoubleDom) {
+        // returns from the current RWI list the best URL entry and removes this entry from the list
+        while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
+            Object[] obrwi = bestRWI(skipDoubleDom);
+            Object bestEntry = obrwi[0];
+            indexRWIEntry ientry = (indexRWIEntry) obrwi[1];
+            long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0;
+            indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking);
+            if (u != null) {
+                this.handover.put(u.hash(), u.comp().url().toNormalform(true, false)); // remember that we handed over this url
+                return u;
+            }
+            misses.add(ientry.urlHash());
+        }
+        return null;
+    }
+
+    public synchronized int size() {
+        //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
+        int c = sortedRWIEntries.size();
+        Iterator i = this.doubleDomCache.values().iterator();
+        while (i.hasNext()) c += ((TreeMap) i.next()).size();
+        return c;
     }

     public int[] flagCount() {
-        return c;
+        return flagcount;
     }

     public int filteredCount() {
@@ -207,17 +363,16 @@
     }

     public indexRWIEntry remove(String urlHash) {
-        Long r = (Long) urlhashes.get(urlHash);
+        Object r = (Long) urlhashes.get(urlHash);
         if (r == null) return null;
-        assert pageAcc.containsKey(r);
-        indexRWIEntry iEntry = (indexRWIEntry) pageAcc.remove(r);
+        assert sortedRWIEntries.containsKey(r);
+        indexRWIEntry iEntry = (indexRWIEntry) sortedRWIEntries.remove(r);
         urlhashes.remove(urlHash);
         return iEntry;
     }
-
-    public Iterator entries(plasmaWordIndex wi, boolean fetchURLs) {
-        // if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects
-        return new rIterator(wi, fetchURLs);
+
+    public Iterator miss() {
+        return this.misses.iterator();
     }

     public Set getReferences(int count) {
@@ -257,35 +412,6 @@
         return this.order;
     }

-    private void remove(boolean rootDomExt, boolean doubleDom) {
-        // this removes all refererences to urls that are extended paths of existing 'RootDom'-urls
-        if (pageAcc.size() <= query.neededResults()) return;
-        HashSet rootDoms = new HashSet();
-        HashSet doubleDoms = new HashSet();
-        Iterator i = pageAcc.entrySet().iterator();
-        Map.Entry entry;
-        indexRWIEntry iEntry;
-        String hashpart;
-        boolean isWordRootURL;
-        TreeSet querywords = plasmaSearchQuery.cleanQuery(query.queryString())[0];
-        while (i.hasNext()) {
-            if (pageAcc.size() <= query.neededResults()) break;
-            entry = (Map.Entry) i.next();
-            iEntry = (indexRWIEntry) entry.getValue();
-            hashpart = iEntry.urlHash().substring(6);
-            isWordRootURL = yacyURL.isWordRootURL(iEntry.urlHash(), querywords);
-            if (isWordRootURL) {
-                rootDoms.add(hashpart);
-            } else {
-                if (((rootDomExt) && (rootDoms.contains(hashpart))) ||
-                    ((doubleDom) && (doubleDoms.contains(hashpart)))) {
-                    i.remove();
-                }
-            }
-            doubleDoms.add(hashpart);
-        }
-    }
-
     public static void loadYBR(File rankingPath, int count) {
         // load ranking tables
         if (rankingPath.exists()) {
@@ -337,4 +463,45 @@
         return 15;
     }

+    public long postRanking(
+            Set topwords,
+            plasmaSearchEvent.ResultEntry rentry,
+            int position) {
+
+        long r = (255 - position) << 8;
+
+        // for media search: prefer pages with many links
+        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << ranking.coeff_cathasimage;
+        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << ranking.coeff_cathasaudio;
+        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << ranking.coeff_cathasvideo;
+        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP  ) r += rentry.lapp() << ranking.coeff_cathasapp;
+
+        // prefer hit with 'prefer' pattern
+        if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << ranking.coeff_prefer;
+        if (rentry.title().matches(query.prefer)) r += 256 << ranking.coeff_prefer;
+
+        // apply 'common-sense' heuristic using references
+        String urlstring = rentry.url().toNormalform(true, true);
+        String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
+        String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
+        for (int j = 0; j < urlcomps.length; j++) {
+            if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << ranking.coeff_urlcompintoplist;
+        }
+        for (int j = 0; j < descrcomps.length; j++) {
+            if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << ranking.coeff_descrcompintoplist;
+        }
+
+        // apply query-in-result matching
+        Set urlcomph = plasmaCondenser.words2hashSet(urlcomps);
+        Set descrcomph = plasmaCondenser.words2hashSet(descrcomps);
+        Iterator shi = query.queryHashes.iterator();
+        String queryhash;
+        while (shi.hasNext()) {
+            queryhash = (String) shi.next();
+            if (urlcomph.contains(queryhash)) r += 256 << ranking.coeff_appurl;
+            if (descrcomph.contains(queryhash)) r += 256 << ranking.coeff_appdescr;
+        }
+
+        return r;
+    }
 }
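The heart of the new ranking process is the `bestRWI()`/`bestURL()` pair: the best entry of each domain is handed out immediately, while further entries from an already-seen domain are parked in `doubleDomCache` and only drained once the primary sorted list is empty. A self-contained sketch of that deferral strategy, with plain URL strings standing in for RWI entries and a crude `domain()` helper replacing the 6-byte domhash:

```java
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

public class DoubleDomDemo {

    private final TreeMap<Long, String> sorted = new TreeMap<Long, String>();
    private final Map<String, TreeMap<Long, String>> doubleDomCache = new HashMap<String, TreeMap<Long, String>>();

    void add(long ranking, String url) { sorted.put(ranking, url); }

    static String domain(String url) { return url.split("/")[2]; } // crude host extraction

    String best() {
        while (!sorted.isEmpty()) {
            Map.Entry<Long, String> e = sorted.pollFirstEntry();
            String dom = domain(e.getValue());
            TreeMap<Long, String> m = doubleDomCache.get(dom);
            if (m == null) {                 // first hit for this domain: hand it out
                doubleDomCache.put(dom, new TreeMap<Long, String>());
                return e.getValue();
            }
            m.put(e.getKey(), e.getValue()); // repeated domain: defer
        }
        // primary list exhausted: serve the best deferred entry across all domains
        Map.Entry<Long, String> best = null;
        TreeMap<Long, String> owner = null;
        for (TreeMap<Long, String> m : doubleDomCache.values()) {
            if (m.isEmpty()) continue;
            if (best == null || m.firstKey() < best.getKey()) {
                best = m.firstEntry();
                owner = m;
            }
        }
        if (best == null) return null;
        owner.remove(best.getKey());
        return best.getValue();
    }

    public static void main(String[] args) {
        DoubleDomDemo d = new DoubleDomDemo();
        d.add(1L, "http://one.net/a");
        d.add(2L, "http://one.net/b");
        d.add(3L, "http://two.net/x");
        System.out.println(d.best()); // http://one.net/a
        System.out.println(d.best()); // http://two.net/x (one.net/b is deferred)
        System.out.println(d.best()); // http://one.net/b (drained from the cache)
    }
}
```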
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
index 021408ec0..1af4a36c3 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
@@ -44,9 +44,6 @@ package de.anomic.plasma;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.Set;
-
-import de.anomic.htmlFilter.htmlFilterContentScraper;

 public class plasmaSearchRankingProfile {

@@ -113,7 +110,7 @@ public class plasmaSearchRankingProfile {
         coeff_appauthor = 13;
         coeff_apptags = 8;
         coeff_appref = 9;
-        coeff_appemph = 11;
+        coeff_appemph = 13;
         coeff_urlcompintoplist = 3;
         coeff_descrcompintoplist = 2;
         coeff_prefer = 15;
@@ -248,47 +245,4 @@ public class plasmaSearchRankingProfile {
         return new String(ext);
     }

-    public long postRanking(
-            plasmaSearchQuery query,
-            Set topwords,
-            plasmaSearchEvent.ResultEntry rentry,
-            int position) {
-
-        long ranking = (255 - position) << 8;
-
-        // for media search: prefer pages with many links
-        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ranking += rentry.limage() << coeff_cathasimage;
-        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ranking += rentry.laudio() << coeff_cathasaudio;
-        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ranking += rentry.lvideo() << coeff_cathasvideo;
-        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP  ) ranking += rentry.lapp() << coeff_cathasapp;
-
-        // prefer hit with 'prefer' pattern
-        if (rentry.url().toNormalform(true, true).matches(query.prefer)) ranking += 256 << coeff_prefer;
-        if (rentry.title().matches(query.prefer)) ranking += 256 << coeff_prefer;
-
-        // apply 'common-sense' heuristic using references
-        String urlstring = rentry.url().toNormalform(true, true);
-        String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
-        String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
-        for (int j = 0; j < urlcomps.length; j++) {
-            if (topwords.contains(urlcomps[j])) ranking += Math.max(1, 256 - urlstring.length()) << coeff_urlcompintoplist;
-        }
-        for (int j = 0; j < descrcomps.length; j++) {
-            if (topwords.contains(descrcomps[j])) ranking += Math.max(1, 256 - rentry.title().length()) << coeff_descrcompintoplist;
-        }
-
-        // apply query-in-result matching
-        Set urlcomph = plasmaCondenser.words2hashSet(urlcomps);
-        Set descrcomph = plasmaCondenser.words2hashSet(descrcomps);
-        Iterator shi = query.queryHashes.iterator();
-        String queryhash;
-        while (shi.hasNext()) {
-            queryhash = (String) shi.next();
-            if (urlcomph.contains(queryhash)) ranking += 256 << coeff_appurl;
-            if (descrcomph.contains(queryhash)) ranking += 256 << coeff_appdescr;
-        }
-
-        return ranking;
-    }
-
 }
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 264bb479d..479e10ec1 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -34,7 +34,6 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeMap;
 import java.util.TreeSet;

 import de.anomic.htmlFilter.htmlFilterContentScraper;
@@ -47,7 +46,6 @@ import de.anomic.index.indexRWIEntry;
 import de.anomic.index.indexRWIRowEntry;
 import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBase64Order;
-import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroCloneableIterator;
 import de.anomic.kelondro.kelondroMergeIterator;
 import de.anomic.kelondro.kelondroOrder;
@@ -65,7 +63,7 @@ public final class plasmaWordIndex implements indexRI {
     public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
     public static final int wCacheMaxChunk = 400; // maximum number of references for each urlhash
     public static final int lowcachedivisor = 320;
-    public static final int maxCollectionPartition = 8; // should be 7
+    public static final int maxCollectionPartition = 7; // should be 7

     private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;
     private final indexRAMRI dhtOutCache, dhtInCache;
@@ -407,115 +405,6 @@
         return new Map[]{inclusionContainers, exclusionContainers};
     }

-    public Finding retrieveURLs(plasmaSearchQuery query, boolean loadurl, int sortorder, plasmaSearchRankingProfile ranking) {
-        // search for a word hash and generate a list of url links
-        // sortorder: 0 = hash, 1 = url, 2 = ranking
-        assert query.queryHashes.size() == 1;
-        final TreeSet mi = new TreeSet();
-        String keyhash = (String) query.queryHashes.first();
-        kelondroBitfield filter = query.constraint;
-        indexContainer index = getContainer(keyhash, null);
-        indexRWIEntry ientry;
-        indexURLEntry uentry;
-        final int[] c = new int[32];
-        for (int i = 0; i < 32; i++) {c[i] = 0;}
-
-        if ((index == null) || (index.size() == 0)) {
-            return new Finding(mi.iterator(), mi.iterator(), mi, 0, c);
-        }
-
-        if (sortorder == 2) {
-            plasmaSearchRankingProcess process = new plasmaSearchRankingProcess(query, null, ranking, query.neededResults());
-            process.insert(index, true);
-            return new Finding(process.entries(this, true), null, mi, process.filteredCount(), process.flagCount());
-        } else {
-            final TreeMap tm = new TreeMap();
-            final ArrayList indexes = new ArrayList();
-
-            final Iterator en = index.entries();
-            // generate a new map where the urls are sorted (not by hash but by the url text)
-
-            loop: while (en.hasNext()) {
-                ientry = (indexRWIEntry) en.next();
-
-                // test if ientry matches with filter
-                if (filter != null) {
-                    // if all = true: let only entries pass that has all matching bits
-                    // if all = false: let all entries pass that has at least one matching bit
-                    if (query.allofconstraint) {
-                        for (int i = 0; i < 32; i++) {
-                            if ((filter.get(i)) && (!ientry.flags().get(i))) continue loop;
-                        }
-                    } else {
-                        boolean nok = true;
-                        flagtest: for (int i = 0; i < 32; i++) {
-                            if ((filter.get(i)) && (ientry.flags().get(i))) {nok = false; break flagtest;}
-                        }
-                        if (nok) continue loop;
-                    }
-                }
-
-                // increase flag counts
-                for (int i = 0; i < 32; i++) {
-                    if (ientry.flags().get(i)) {c[i]++;}
-                }
-
-                // load url
-                if (loadurl) {
-                    uentry = loadedURL.load(ientry.urlHash(), ientry, 0);
-                    if (uentry == null) {
-                        mi.add(ientry.urlHash());
-                    } else {
-                        if (sortorder == 0) {
-                            tm.put(uentry.comp().url().toNormalform(false, true), uentry);
-                        }
-                        if (sortorder == 1) {
-                            tm.put(ientry.urlHash(), uentry);
-                        }
-                    }
-                } else {
-                    indexes.add(ientry);
-                }
-                if ((query.neededResults() > 0) && (mi.size() + tm.size() > query.neededResults())) break loop;
-            } // end loop
-            if (loadurl) {
-                return new Finding(tm.values().iterator(), null, mi, tm.size(), c);
-            } else {
-                return new Finding(null, indexes.iterator(), mi, indexes.size(), c);
-            }
-        }
-    }
-
-    public static class Finding {
-        private Iterator urls; // an iterator if indexURLEntry objects
-        private Iterator rwientries; // an iterator of indexRWIEntry objects
-        private Set misses; // a set of hashes where we did not found items
-        private int findcount;
-        private int[] flagcount;
-        public Finding(Iterator urls, Iterator rwientries, Set misses, int findcount, int[] flagcount) {
-            this.findcount = findcount;
-            this.urls = urls;
-            this.rwientries = rwientries;
-            this.misses = misses;
-            this.flagcount = flagcount;
-        }
-        public int size() {
-            return this.findcount;
-        }
-        public Iterator urls() {
-            return this.urls;
-        }
-        public Iterator rwientries() {
-            return this.rwientries;
-        }
-        public Set miss() {
-            return this.misses;
-        }
-        public int[] flagcount() {
-            return this.flagcount;
-        }
-    }
-
     public int size() {
         return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size()));
     }
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 6a40273a1..8640933eb 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -380,7 +380,7 @@ public final class yacyClient {
         post.put("ttl", "0");
         post.put("maxdist", maxDistance);
         post.put("profile", crypt.simpleEncode(rankingProfile.toExternalString()));
-        post.put("constraint", constraint.exportB64());
+        post.put("constraint", (constraint == null) ? "" : constraint.exportB64());
         if (abstractCache != null) post.put("abstracts", "auto");
         final long timestamp = System.currentTimeMillis();
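For orientation, the `postRanking()` method that moved from plasmaSearchRankingProfile into plasmaSearchRankingProcess composes its score from contributions shifted left by per-profile coefficients, so a high-coefficient signal such as a 'prefer' pattern match dominates everything beneath it. A worked sketch of that arithmetic using the coefficient defaults visible in this patch (`coeff_prefer = 15`, `coeff_urlcompintoplist = 3`); this is a simplified stand-in, not the full implementation:

```java
public class PostRankingDemo {

    static final int COEFF_PREFER = 15;              // as in plasmaSearchRankingProfile
    static final int COEFF_URLCOMP_IN_TOPLIST = 3;

    static long postRanking(int position, boolean matchesPrefer, int urlLength, int topwordHits) {
        long r = (255 - position) << 8;              // earlier list positions rank higher
        if (matchesPrefer) r += 256 << COEFF_PREFER; // a 'prefer' hit adds 8388608
        // shorter urls containing top words get a bonus, as in
        // Math.max(1, 256 - urlstring.length()) << coeff_urlcompintoplist
        r += (long) topwordHits * (Math.max(1, 256 - urlLength) << COEFF_URLCOMP_IN_TOPLIST);
        return r;
    }

    public static void main(String[] args) {
        System.out.println(postRanking(0, false, 40, 0)); // 65280: position contribution only
        System.out.println(postRanking(0, false, 40, 1)); // 67008: +1728 for one top-word hit
        System.out.println(postRanking(0, true, 40, 1));  // 8455616: the prefer bonus dwarfs the rest
    }
}
```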