diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 7f86197bd..dbe3951dd 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -54,6 +54,7 @@ import java.util.HashSet; import java.util.HashMap; import java.util.Iterator; import java.util.TreeMap; + import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlLURL; @@ -150,9 +151,7 @@ public class IndexControl_p { // generate an urlx array plasmaWordIndexEntity index = null; try { - HashSet keyhashes = new HashSet(); - keyhashes.add(keyhash); - index = switchboard.searchManager.searchHashes(keyhashes, 10000); + index = switchboard.wordIndex.getEntity(keyhash, true); Enumeration en = index.elements(true); int i = 0; urlx = new String[index.size()]; @@ -437,9 +436,7 @@ public class IndexControl_p { // search for a word hash and generate a list of url links plasmaWordIndexEntity index = null; try { - final HashSet keyhashes = new HashSet(); - keyhashes.add(keyhash); - index = switchboard.searchManager.searchHashes(keyhashes, 10000); + index = switchboard.wordIndex.getEntity(keyhash, true); final StringBuffer result = new StringBuffer(1024); if (index.size() == 0) { diff --git a/htroot/index.java b/htroot/index.java index 423d7a01f..64c210bf6 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -139,8 +139,8 @@ public class index { (yacyCore.seedDB.mySeed != null) && (yacyCore.seedDB.mySeed.getAddress() != null)); - final String order1 = (order.equals("Quality-Date")) ? "quality" : "date"; - final String order2 = (order.equals("Quality-Date")) ? "date" : "quality"; + final String order1 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_QUALITY : plasmaSearchQuery.ORDER_DATE; + final String order2 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_DATE : plasmaSearchQuery.ORDER_QUALITY; String urlmask = ""; if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) { urlmask = ".*"; @@ -149,7 +149,7 @@ public class index { } // do the search - plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, referer, new String[]{order1, order2}, count, searchtime, urlmask, + plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2}, count, searchtime, urlmask, referer, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20); final serverObjects prop = sb.searchFromLocal(thisSearch); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index dd0ad6763..2e69d3048 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -51,6 +51,7 @@ import java.util.HashSet; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaSearchQuery; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -89,7 +90,11 @@ public final class search { keyhashes.add(query.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength)); } final long timestamp = System.currentTimeMillis(); - prop = sb.searchFromRemote(keyhashes, count, global, duetime); + + plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_QUALITY, plasmaSearchQuery.ORDER_DATE}, + count, duetime, ".*"); + + prop = sb.searchFromRemote(squery); prop.put("searchtime", Long.toString(System.currentTimeMillis() - timestamp)); final int links = Integer.parseInt(prop.get("linkcount","0")); diff --git a/source/de/anomic/plasma/plasmaSearch.java b/source/de/anomic/plasma/plasmaSearch.java index 19469cf6f..a7d753f92 100644 --- a/source/de/anomic/plasma/plasmaSearch.java +++ b/source/de/anomic/plasma/plasmaSearch.java @@ -113,89 +113,19 @@ public final class plasmaSearch { //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries"); return condenser.getWords().size(); } - + /* public plasmaWordIndexEntity searchWords(Set words, long time) throws IOException { - // search for the set of words and return an array of urlEntry elements - return searchHashes(plasmaSearchQuery.words2hashes(words), time); - } + } + */ + /* public plasmaWordIndexEntity searchHashes(Set hashes, long time) throws IOException { - // search for the set of hashes and return an array of urlEntry elements - - long stamp = System.currentTimeMillis(); - TreeMap map = new TreeMap(); - String singleHash; - plasmaWordIndexEntity singleResult; - Iterator i = hashes.iterator(); - while (i.hasNext()) { - // get next hash: - singleHash = (String) i.next(); - - // retrieve index - singleResult = wordIndex.getEntity(singleHash, true); - - // check result - if ((singleResult == null) || (singleResult.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a cunjunction of searches, we have no result if any word is not known - - // store result in order of result size - map.put(serverCodings.enhancedCoder.encodeHex(singleResult.size(), 8) + singleHash, singleResult); - } - - // check if there is any result - if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found - - // the map now holds the search results in order of number of hits per word - // we now must pairwise build up a conjunction of these sets - String k = (String) map.firstKey(); // the smallest, which means, the one with the least entries - plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k); - while ((map.size() > 0) && (searchResult.size() > 0) && (time > 0)) { - // take the first element of map which is a result and combine it with result - k = (String) map.firstKey(); // the next smallest... - time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); - searchA = searchResult; - searchB = (plasmaWordIndexEntity) map.remove(k); - searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1)); - // close the input files/structures - if (searchA != searchResult) searchA.close(); - if (searchB != searchResult) searchB.close(); - } - searchA = null; // free resources - searchB = null; // free resources - // in 'searchResult' is now the combined search result - if (searchResult.size() == 0) return new plasmaWordIndexEntity(null); - return searchResult; } - + */ + /* public plasmaSearchResult order(plasmaWordIndexEntity searchResult, Set searchhashes, Set stopwords, char[] priority, long maxTime, int minEntries) throws IOException { - // we collect the urlhashes from it and construct a List with urlEntry objects - // attention: if minEntries is too high, this method will not terminate within the maxTime - plasmaSearchResult acc = new plasmaSearchResult(searchhashes, stopwords, priority); - if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty - if (searchResult.size() == 0) return acc; // case that we have nothing to do - - Enumeration e = searchResult.elements(true); - plasmaWordIndexEntry entry; - long startCreateTime = System.currentTimeMillis(); - plasmaCrawlLURL.Entry page; - try { - while (e.hasMoreElements()) { - if ((acc.sizeFetched() >= minEntries) && - (System.currentTimeMillis() - startCreateTime >= maxTime)) break; - entry = (plasmaWordIndexEntry) e.nextElement(); - // find the url entry - page = urlStore.getEntry(entry.getUrlHash()); - // add a result - acc.addResult(entry, page); - } - } catch (kelondroException ee) { - serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); - } - long startSortTime = System.currentTimeMillis(); - acc.sortResults(); - serverLog.logFine("PLASMA", "plasmaSearch.order: minEntries = " + minEntries + ", effectiveEntries = " + acc.sizeOrdered() + ", demanded Time = " + maxTime + ", effectiveTime = " + (System.currentTimeMillis() - startCreateTime) + ", createTime = " + (startSortTime - startCreateTime) + ", sortTime = " + (System.currentTimeMillis() - startSortTime)); - return acc; } - + */ } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 13e556cf3..46bd79a8d 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -43,13 +43,98 @@ package de.anomic.plasma; import java.util.Iterator; +import java.util.Set; +import java.util.HashSet; +import java.util.TreeMap; +import java.util.Enumeration; +import java.io.IOException; + +import de.anomic.kelondro.kelondroException; +import de.anomic.server.logging.serverLog; +import de.anomic.server.serverCodings; public final class plasmaSearchEvent { + private serverLog log; private plasmaSearchQuery query; - - public plasmaSearchEvent(plasmaSearchQuery query) { + private plasmaWordIndex wordIndex; + private plasmaCrawlLURL urlStore; + private plasmaSnippetCache snippetCache; + + public plasmaSearchEvent(plasmaSearchQuery query, serverLog log, plasmaWordIndex wordIndex, plasmaCrawlLURL urlStore, plasmaSnippetCache snippetCache) { + this.log = log; + this.wordIndex = wordIndex; this.query = query; + this.urlStore = urlStore; + this.snippetCache = snippetCache; + } + + public plasmaWordIndexEntity search(long time) throws IOException { + // search for the set of hashes and return an array of urlEntry elements + + long stamp = System.currentTimeMillis(); + + // retrieve entities that belong to the hashes + Set entities = wordIndex.getEntities(query.queryHashes, true, true); + + // since this is a conjunction we return an empty entity if any word is not known + if (entities == null) return new plasmaWordIndexEntity(null); + + // join the result + return plasmaWordIndexEntity.joinEntities(entities, time - (System.currentTimeMillis() - stamp)); } + public plasmaSearchResult order(plasmaWordIndexEntity searchResult, long maxTime, int minEntries) throws IOException { + // we collect the urlhashes from it and construct a List with urlEntry objects + // attention: if minEntries is too high, this method will not terminate within the maxTime + + plasmaSearchResult acc = new plasmaSearchResult(query); + if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty + if (searchResult.size() == 0) return acc; // case that we have nothing to do + + Enumeration e = searchResult.elements(true); + plasmaWordIndexEntry entry; + long startCreateTime = System.currentTimeMillis(); + plasmaCrawlLURL.Entry page; + try { + while (e.hasMoreElements()) { + if ((acc.sizeFetched() >= minEntries) && + (System.currentTimeMillis() - startCreateTime >= maxTime)) break; + entry = (plasmaWordIndexEntry) e.nextElement(); + // find the url entry + page = urlStore.getEntry(entry.getUrlHash()); + // add a result + acc.addResult(entry, page); + } + } catch (kelondroException ee) { + serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); + } + long startSortTime = System.currentTimeMillis(); + acc.sortResults(); + serverLog.logFine("PLASMA", "plasmaSearch.order: minEntries = " + minEntries + ", effectiveEntries = " + acc.sizeOrdered() + ", demanded Time = " + maxTime + ", effectiveTime = " + (System.currentTimeMillis() - startCreateTime) + ", createTime = " + (startSortTime - startCreateTime) + ", sortTime = " + (System.currentTimeMillis() - startSortTime)); + return acc; + } + + /* + public void preSearch() { + plasmaWordIndexEntity idx = null; + try { + // search the database locally + log.logFine("presearch: started job"); + idx = searchHashes(query.queryHashes, time); + log.logFine("presearch: found " + idx.size() + " results"); + plasmaSearchResult acc = order(idx, queryhashes, order, time, searchcount); + if (acc == null) return; + log.logFine("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch"); + + // take some elements and fetch the snippets + snippetCache.fetch(acc, queryhashes, urlmask, fetchcount); + } catch (IOException e) { + log.logSevere("presearch: failed", e); + } finally { + if (idx != null) try { idx.close(); } catch (Exception e){} + } + log.logFine("presearch: job terminated"); + } + */ } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index fdd145292..f7cf1eb7a 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -52,6 +52,9 @@ import de.anomic.server.serverByteBuffer; public final class plasmaSearchQuery { + public static final String ORDER_QUALITY = "quality"; + public static final String ORDER_DATE = "date"; + public static final int SEARCHDOM_LOCAL = 0; public static final int SEARCHDOM_GROUPDHT = 1; public static final int SEARCHDOM_GROUPALL = 2; @@ -69,21 +72,35 @@ public final class plasmaSearchQuery { public String domGroupName; public int domMaxTargets; - public plasmaSearchQuery(Set queryWords, String referrer, + public plasmaSearchQuery(Set queryWords, String[] order, int wantedResults, long maximumTime, String urlMask, + String referrer, int domType, String domGroupName, int domMaxTargets) { this.queryWords = queryWords; this.queryHashes = words2hashes(queryWords); - this.referrer = referrer; this.order = order; this.wantedResults = wantedResults; this.maximumTime = maximumTime; this.urlMask = urlMask; + this.referrer = referrer; this.domType = domType; this.domGroupName = domGroupName; this.domMaxTargets = domMaxTargets; } + public plasmaSearchQuery(Set queryHashes, + String[] order, int wantedResults, long maximumTime, String urlMask) { + this.queryWords = null; + this.queryHashes = queryHashes; + this.order = order; + this.wantedResults = wantedResults; + this.maximumTime = maximumTime; + this.urlMask = urlMask; + this.referrer = referrer; + this.domType = -1; + this.domGroupName = null; + this.domMaxTargets = -1; + } public static Set words2hashes(String[] words) { TreeSet hashes = new TreeSet(); @@ -117,4 +134,13 @@ public final class plasmaSearchQuery { return query; } + public void filterOut(Set blueList) { + // filter out words that appear in this set + Iterator it = queryWords.iterator(); + String word; + while (it.hasNext()) { + word = (String) it.next(); + if (blueList.contains(word)) it.remove(); + } + } } diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 07d97833a..142a19003 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -54,29 +54,23 @@ import de.anomic.server.serverCodings; public final class plasmaSearchResult { - public static final char O_QUALITY = 'q'; - public static final char O_AGE = 'a'; public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic - private Set searchhashes; // hashes that are searched here - private Set stopwords; // words that are excluded from the commonSense heuristic - private char[] order; // order of heuristics private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects + private plasmaSearchQuery query; - public plasmaSearchResult(Set searchhashes, Set stopwords, char[] order) { + public plasmaSearchResult(plasmaSearchQuery query) { this.pageAcc = new TreeMap(); - ref = new kelondroMScoreCluster(); - this.searchhashes = searchhashes; - this.stopwords = stopwords; - this.order = order; + this.ref = new kelondroMScoreCluster(); this.results = new ArrayList(); + this.query = query; } public plasmaSearchResult cloneSmart() { // clones only the top structure - plasmaSearchResult theClone = new plasmaSearchResult(this.searchhashes, this.stopwords, this.order); + plasmaSearchResult theClone = new plasmaSearchResult(query); theClone.pageAcc = (TreeMap) this.pageAcc.clone(); theClone.ref = this.ref; theClone.results = this.results; @@ -149,10 +143,10 @@ public final class plasmaSearchResult { // apply pre-calculated order attributes ranking = 0; - if (order[0] == O_QUALITY) ranking = 4096 * indexEntry.getQuality(); - else if (order[0] == O_AGE) ranking = 4096 * indexEntry.getVirtualAge(); - if (order[1] == O_QUALITY) ranking += indexEntry.getQuality(); - else if (order[1] == O_AGE) ranking += indexEntry.getVirtualAge(); + if (query.order[0].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = 4096 * indexEntry.getQuality(); + else if (query.order[0].equals(plasmaSearchQuery.ORDER_DATE)) ranking = 4096 * indexEntry.getVirtualAge(); + if (query.order[1].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality(); + else if (query.order[1].equals(plasmaSearchQuery.ORDER_DATE)) ranking += indexEntry.getVirtualAge(); // apply 'common-sense' heuristic using references for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += inc; @@ -161,7 +155,7 @@ public final class plasmaSearchResult { // apply query-in-result matching Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps); Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps); - Iterator shi = searchhashes.iterator(); + Iterator shi = query.queryHashes.iterator(); while (shi.hasNext()) { queryhash = (String) shi.next(); if (urlcomph.contains(queryhash)) ranking += 10 * inc; @@ -187,9 +181,8 @@ public final class plasmaSearchResult { for (int i = 0; i < words.length; i++) { word = words[i].toLowerCase(); if ((word.length() > 2) && - (!(stopwords.contains(word))) && - ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) && - (!(searchhashes.contains(plasmaWordIndexEntry.word2hash(word))))) + ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) && + (!(query.queryHashes.contains(plasmaWordIndexEntry.word2hash(word))))) ref.incScore(word); } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 6604c095f..d29c7824b 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1394,13 +1394,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (date == null) return ""; else return DateFormatter.format(date); } + /* public class presearch extends Thread { Set queryhashes; char[] order; String urlmask; long time; int searchcount, fetchcount; - public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int searchcount, int fetchcount) { + public presearch(Set queryhashes, char[] order, long time, String urlmask, int searchcount, int fetchcount) { this.queryhashes = queryhashes; this.order = order; this.urlmask = urlmask; @@ -1430,38 +1431,34 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } + */ + //public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) { public serverObjects searchFromLocal(plasmaSearchQuery query) { // tell all threads to do nothing for a specific time - wordIndex.intermission(query.maximumTime); - intermissionAllThreads(query.maximumTime); + wordIndex.intermission(2 * query.maximumTime); + intermissionAllThreads(2 * query.maximumTime); serverObjects prop = new serverObjects(); try { - char[] order = new char[2]; - if (query.order[0].equals("quality")) order[0] = plasmaSearchResult.O_QUALITY; else order[0] = plasmaSearchResult.O_AGE; - if (query.order[1].equals("quality")) order[1] = plasmaSearchResult.O_QUALITY; else order[1] = plasmaSearchResult.O_AGE; + //char[] order = new char[2]; + //if (query.order[0].equals("quality")) order[0] = plasmaSearchResult.O_QUALITY; else order[0] = plasmaSearchResult.O_AGE; + //if (query.order[1].equals("quality")) order[1] = plasmaSearchResult.O_QUALITY; else order[1] = plasmaSearchResult.O_AGE; // filter out words that appear in bluelist - Iterator it = query.queryWords.iterator(); - String word, gs = ""; - while (it.hasNext()) { - word = (String) it.next(); - if (blueList.contains(word)) it.remove(); else gs += "+" + word; - } - if (gs.length() > 0) gs = gs.substring(1); + query.filterOut(blueList); // log - log.logInfo("INIT WORD SEARCH: " + gs + ":" + query.queryHashes + " - " + query.wantedResults + " links, " + (query.maximumTime / 1000) + " seconds"); + log.logInfo("INIT WORD SEARCH: " + query.queryWords + ":" + query.queryHashes + " - " + query.wantedResults + " links, " + (query.maximumTime / 1000) + " seconds"); long timestamp = System.currentTimeMillis(); // start a presearch, which makes only sense if we idle afterwards. // this is especially the case if we start a global search and idle until search - if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) { - Thread preselect = new presearch(query.queryHashes, order, query.maximumTime / 10, query.urlMask, 10, 3); - preselect.start(); - } + //if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) { + // Thread preselect = new presearch(query.queryHashes, order, query.maximumTime / 10, query.urlMask, 10, 3); + // preselect.start(); + //} // do global fetching int globalresults = 0; @@ -1479,13 +1476,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // now search locally (the global results should be now in the local db) long remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp); - plasmaWordIndexEntity idx = searchManager.searchHashes(query.queryHashes, remainingTime * 8 / 10); // the search + plasmaSearchEvent theSearch = new plasmaSearchEvent(query, log, wordIndex, urlPool.loadedURL, snippetCache); + plasmaWordIndexEntity idx = theSearch.search(remainingTime * 8 / 10); log.logFine("SEARCH TIME AFTER FINDING " + idx.size() + " ELEMENTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp); if (remainingTime < 500) remainingTime = 500; if (remainingTime > 3000) remainingTime = 3000; - plasmaSearchResult acc = searchManager.order(idx, query.queryHashes, stopwords, order, remainingTime, 10); + plasmaSearchResult acc = theSearch.order(idx, remainingTime, 10); if (query.domType != plasmaSearchQuery.SEARCHDOM_GLOBALDHT) snippetCache.fetch(acc.cloneSmart(), query.queryHashes, query.urlMask, 10); log.logFine("SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); @@ -1595,7 +1593,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // log - log.logInfo("EXIT WORD SEARCH: " + gs + " - " + + log.logInfo("EXIT WORD SEARCH: " + query.queryWords + " - " + prop.get("totalcount", "0") + " links found, " + prop.get("orderedcount", "0") + " links ordered, " + prop.get("linkcount", "?") + " links selected, " + @@ -1607,21 +1605,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } - public serverObjects searchFromRemote(Set hashes, int count, boolean global, long duetime) { + public serverObjects searchFromRemote(plasmaSearchQuery query) { // tell all threads to do nothing for a specific time - wordIndex.intermission(duetime); - intermissionAllThreads(duetime); + wordIndex.intermission(2 * query.maximumTime); + intermissionAllThreads(2 * query.maximumTime); - if (hashes == null) hashes = new HashSet(); serverObjects prop = new serverObjects(); try { - log.logInfo("INIT HASH SEARCH: " + hashes + " - " + count + " links"); + log.logInfo("INIT HASH SEARCH: " + query.queryHashes + " - " + query.wantedResults + " links"); long timestamp = System.currentTimeMillis(); - plasmaWordIndexEntity idx = searchManager.searchHashes(hashes, duetime * 8 / 10); // a nameless temporary index, not sorted by special order but by hash - long remainingTime = duetime - (System.currentTimeMillis() - timestamp); + plasmaSearchEvent theSearch = new plasmaSearchEvent(query, log, wordIndex, urlPool.loadedURL, snippetCache); + plasmaWordIndexEntity idx = theSearch.search(query.maximumTime * 8 / 10); + long remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp); if (remainingTime < 500) remainingTime = 500; - plasmaSearchResult acc = searchManager.order(idx, hashes, stopwords, new char[]{plasmaSearchResult.O_QUALITY, plasmaSearchResult.O_AGE}, remainingTime, 10); + plasmaSearchResult acc = theSearch.order(idx, remainingTime, 10); // result is a List of urlEntry elements if (acc == null) { @@ -1636,9 +1634,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //plasmaIndexEntry pie; plasmaCrawlLURL.Entry urlentry; plasmaSnippetCache.result snippet; - while ((acc.hasMoreElements()) && (i < count)) { + while ((acc.hasMoreElements()) && (i < query.wantedResults)) { urlentry = acc.nextElement(); - snippet = snippetCache.retrieve(urlentry.url(), hashes, false, 260); + snippet = snippetCache.retrieve(urlentry.url(), query.queryHashes, false, 260); if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) { // suppress line: there is no match in that resource } else { @@ -1669,7 +1667,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations) // log - log.logInfo("EXIT HASH SEARCH: " + hashes + " - " + + log.logInfo("EXIT HASH SEARCH: " + query.queryHashes + " - " + ((idx == null) ? "0" : (""+idx.size())) + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index e756a2a16..8ec91c388 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -53,6 +53,8 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; import java.util.TreeSet; +import java.util.HashSet; +import java.util.Set; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.server.logging.serverLog; @@ -105,6 +107,28 @@ public final class plasmaWordIndex { return ramCache.getIndex(wordHash, deleteIfEmpty); } + public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty) { + + // retrieve entities that belong to the hashes + HashSet entities = new HashSet(); + String singleHash; + plasmaWordIndexEntity singleEntity; + Iterator i = wordHashes.iterator(); + while (i.hasNext()) { + // get next hash: + singleHash = (String) i.next(); + + // retrieve index + singleEntity = getEntity(singleHash, true); + + // check result + if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null; + + entities.add(singleEntity); + } + return entities; + } + public int size() { return ramCache.size(); } diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index 0a5963097..45ac236b7 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -46,6 +46,7 @@ import java.io.IOException; import java.util.Enumeration; import java.util.Iterator; import java.util.TreeMap; +import java.util.Set; import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroTree; @@ -293,6 +294,54 @@ public final class plasmaWordIndexEntity { return l; } + public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException { + + long stamp = System.currentTimeMillis(); + + // order entities by their size + TreeMap map = new TreeMap(); + plasmaWordIndexEntity singleEntity; + Iterator i = entities.iterator(); + int count = 0; + while (i.hasNext()) { + // get next entity: + singleEntity = (plasmaWordIndexEntity) i.next(); + + // check result + if ((singleEntity == null) || (singleEntity.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a cunjunction of searches, we have no result if any word is not known + + // store result in order of result size + map.put(new Long(singleEntity.size() * 1000 + count), singleEntity); + count++; + } + + // check if there is any result + if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found + + // the map now holds the search results in order of number of hits per word + // we now must pairwise build up a conjunction of these sets + Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries + plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k); + while ((map.size() > 0) && (searchResult.size() > 0) && (time > 0)) { + // take the first element of map which is a result and combine it with result + k = (Long) map.firstKey(); // the next smallest... + time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); + searchA = searchResult; + searchB = (plasmaWordIndexEntity) map.remove(k); + searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1)); + // close the input files/structures + if (searchA != searchResult) searchA.close(); + if (searchB != searchResult) searchB.close(); + } + searchA = null; // free resources + searchB = null; // free resources + + // in 'searchResult' is now the combined search result + if (searchResult.size() == 0) return new plasmaWordIndexEntity(null); + return searchResult; + } + + public static plasmaWordIndexEntity joinConstructive(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException { if ((i1 == null) || (i2 == null)) return null; if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntity(null); diff --git a/source/de/anomic/server/serverAbstractThread.java b/source/de/anomic/server/serverAbstractThread.java index 7d7724bf3..80d521c41 100644 --- a/source/de/anomic/server/serverAbstractThread.java +++ b/source/de/anomic/server/serverAbstractThread.java @@ -238,8 +238,10 @@ public abstract class serverAbstractThread extends Thread implements serverThrea while (running) { if (this.intermission > 0) { - if (this.intermission > System.currentTimeMillis()) { - ratz(this.intermission - System.currentTimeMillis()); + long itime = this.intermission - System.currentTimeMillis(); + if (itime > 0) { + logSystem("thread '" + this.getName() + "' breaks for intermission: " + (itime / 1000) + " seconds"); + ratz(itime); } this.intermission = 0; }