refactoring of search / preparation for better search methods

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@921 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 9ff2bd2f0e
commit d29dfb0a12

@ -54,6 +54,7 @@ import java.util.HashSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
@ -150,9 +151,7 @@ public class IndexControl_p {
// generate an urlx array
plasmaWordIndexEntity index = null;
try {
HashSet keyhashes = new HashSet();
keyhashes.add(keyhash);
index = switchboard.searchManager.searchHashes(keyhashes, 10000);
index = switchboard.wordIndex.getEntity(keyhash, true);
Enumeration en = index.elements(true);
int i = 0;
urlx = new String[index.size()];
@ -437,9 +436,7 @@ public class IndexControl_p {
// search for a word hash and generate a list of url links
plasmaWordIndexEntity index = null;
try {
final HashSet keyhashes = new HashSet();
keyhashes.add(keyhash);
index = switchboard.searchManager.searchHashes(keyhashes, 10000);
index = switchboard.wordIndex.getEntity(keyhash, true);
final StringBuffer result = new StringBuffer(1024);
if (index.size() == 0) {

@ -139,8 +139,8 @@ public class index {
(yacyCore.seedDB.mySeed != null) &&
(yacyCore.seedDB.mySeed.getAddress() != null));
final String order1 = (order.equals("Quality-Date")) ? "quality" : "date";
final String order2 = (order.equals("Quality-Date")) ? "date" : "quality";
final String order1 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_QUALITY : plasmaSearchQuery.ORDER_DATE;
final String order2 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_DATE : plasmaSearchQuery.ORDER_QUALITY;
String urlmask = "";
if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) {
urlmask = ".*";
@ -149,7 +149,7 @@ public class index {
}
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, referer, new String[]{order1, order2}, count, searchtime, urlmask,
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2}, count, searchtime, urlmask, referer,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
final serverObjects prop = sb.searchFromLocal(thisSearch);

@ -51,6 +51,7 @@ import java.util.HashSet;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@ -89,7 +90,11 @@ public final class search {
keyhashes.add(query.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength));
}
final long timestamp = System.currentTimeMillis();
prop = sb.searchFromRemote(keyhashes, count, global, duetime);
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_QUALITY, plasmaSearchQuery.ORDER_DATE},
count, duetime, ".*");
prop = sb.searchFromRemote(squery);
prop.put("searchtime", Long.toString(System.currentTimeMillis() - timestamp));
final int links = Integer.parseInt(prop.get("linkcount","0"));

@ -113,89 +113,19 @@ public final class plasmaSearch {
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
return condenser.getWords().size();
}
/*
public plasmaWordIndexEntity searchWords(Set words, long time) throws IOException {
// search for the set of words and return an array of urlEntry elements
return searchHashes(plasmaSearchQuery.words2hashes(words), time);
}
public plasmaWordIndexEntity searchHashes(Set hashes, long time) throws IOException {
// search for the set of hashes and return an array of urlEntry elements
long stamp = System.currentTimeMillis();
TreeMap map = new TreeMap();
String singleHash;
plasmaWordIndexEntity singleResult;
Iterator i = hashes.iterator();
while (i.hasNext()) {
// get next hash:
singleHash = (String) i.next();
// retrieve index
singleResult = wordIndex.getEntity(singleHash, true);
// check result
if ((singleResult == null) || (singleResult.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a cunjunction of searches, we have no result if any word is not known
// store result in order of result size
map.put(serverCodings.enhancedCoder.encodeHex(singleResult.size(), 8) + singleHash, singleResult);
}
*/
/*
public plasmaWordIndexEntity searchHashes(Set hashes, long time) throws IOException {
// check if there is any result
if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found
// the map now holds the search results in order of number of hits per word
// we now must pairwise build up a conjunction of these sets
String k = (String) map.firstKey(); // the smallest, which means, the one with the least entries
plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k);
while ((map.size() > 0) && (searchResult.size() > 0) && (time > 0)) {
// take the first element of map which is a result and combine it with result
k = (String) map.firstKey(); // the next smallest...
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
searchA = searchResult;
searchB = (plasmaWordIndexEntity) map.remove(k);
searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1));
// close the input files/structures
if (searchA != searchResult) searchA.close();
if (searchB != searchResult) searchB.close();
}
searchA = null; // free resources
searchB = null; // free resources
// in 'searchResult' is now the combined search result
if (searchResult.size() == 0) return new plasmaWordIndexEntity(null);
return searchResult;
}
*/
/*
public plasmaSearchResult order(plasmaWordIndexEntity searchResult, Set searchhashes, Set stopwords, char[] priority, long maxTime, int minEntries) throws IOException {
// we collect the urlhashes from it and construct a List with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
plasmaSearchResult acc = new plasmaSearchResult(searchhashes, stopwords, priority);
if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
if (searchResult.size() == 0) return acc; // case that we have nothing to do
Enumeration e = searchResult.elements(true);
plasmaWordIndexEntry entry;
long startCreateTime = System.currentTimeMillis();
plasmaCrawlLURL.Entry page;
try {
while (e.hasMoreElements()) {
if ((acc.sizeFetched() >= minEntries) &&
(System.currentTimeMillis() - startCreateTime >= maxTime)) break;
entry = (plasmaWordIndexEntry) e.nextElement();
// find the url entry
page = urlStore.getEntry(entry.getUrlHash());
// add a result
acc.addResult(entry, page);
}
} catch (kelondroException ee) {
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
}
long startSortTime = System.currentTimeMillis();
acc.sortResults();
serverLog.logFine("PLASMA", "plasmaSearch.order: minEntries = " + minEntries + ", effectiveEntries = " + acc.sizeOrdered() + ", demanded Time = " + maxTime + ", effectiveTime = " + (System.currentTimeMillis() - startCreateTime) + ", createTime = " + (startSortTime - startCreateTime) + ", sortTime = " + (System.currentTimeMillis() - startSortTime));
return acc;
}
*/
}

@ -43,13 +43,98 @@
package de.anomic.plasma;
import java.util.Iterator;
import java.util.Set;
import java.util.HashSet;
import java.util.TreeMap;
import java.util.Enumeration;
import java.io.IOException;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverCodings;
public final class plasmaSearchEvent {
private serverLog log;
private plasmaSearchQuery query;
private plasmaWordIndex wordIndex;
private plasmaCrawlLURL urlStore;
private plasmaSnippetCache snippetCache;
public plasmaSearchEvent(plasmaSearchQuery query) {
public plasmaSearchEvent(plasmaSearchQuery query, serverLog log, plasmaWordIndex wordIndex, plasmaCrawlLURL urlStore, plasmaSnippetCache snippetCache) {
this.log = log;
this.wordIndex = wordIndex;
this.query = query;
this.urlStore = urlStore;
this.snippetCache = snippetCache;
}
/**
 * Runs the conjunctive word search for this event's query.
 *
 * Fetches the index entities for all query hashes and joins them into one
 * entity. The time spent on the lookup is subtracted from the budget that
 * is handed to the join.
 *
 * @param time time budget in milliseconds for lookup plus join
 * @return the joined entity; an empty entity if the lookup reported that
 *         a query word is unknown
 * @throws IOException passed on from the join
 */
public plasmaWordIndexEntity search(long time) throws IOException {
    final long lookupStart = System.currentTimeMillis();

    // one entity per query hash; the lookup is asked to give up (null)
    // as soon as one hash has no or an empty entity
    final Set found = wordIndex.getEntities(query.queryHashes, true, true);
    if (found == null) {
        // a conjunction with an unknown word cannot match anything
        return new plasmaWordIndexEntity(null);
    }

    // hand the remaining budget to the join
    final long spent = System.currentTimeMillis() - lookupStart;
    return plasmaWordIndexEntity.joinEntities(found, time - spent);
}
/**
 * Builds a ranked result for the given joined search entity.
 *
 * Every index entry is paired with its URL entry from the URL store and
 * added to a fresh result accumulator, which is sorted afterwards.
 * Collection stops only when at least minEntries have been fetched AND
 * maxTime has elapsed, so a large minEntries can make this method run
 * longer than maxTime.
 *
 * @param searchResult joined index entity; null or empty yields an empty result
 * @param maxTime      time limit in milliseconds for collecting entries
 * @param minEntries   number of entries to fetch before the time limit applies
 * @return the sorted result accumulator, never null
 * @throws IOException declared for the index and URL store accesses
 */
public plasmaSearchResult order(plasmaWordIndexEntity searchResult, long maxTime, int minEntries) throws IOException {
    final plasmaSearchResult ranked = new plasmaSearchResult(query);

    // improper or empty input: hand back the empty accumulator
    if (searchResult == null) return ranked;
    if (searchResult.size() == 0) return ranked;

    final Enumeration entries = searchResult.elements(true);
    final long startCreateTime = System.currentTimeMillis();
    try {
        while (entries.hasMoreElements()) {
            // the clock is only consulted once enough entries are fetched
            final boolean enoughFetched = ranked.sizeFetched() >= minEntries;
            if (enoughFetched && (System.currentTimeMillis() - startCreateTime >= maxTime)) {
                break;
            }
            final plasmaWordIndexEntry indexEntry = (plasmaWordIndexEntry) entries.nextElement();
            // look up the URL record that belongs to this index entry
            final plasmaCrawlLURL.Entry urlEntry = urlStore.getEntry(indexEntry.getUrlHash());
            ranked.addResult(indexEntry, urlEntry);
        }
    } catch (kelondroException dbFailure) {
        // a database failure ends the collection; what was gathered so far is still sorted
        serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + dbFailure.getMessage(), dbFailure);
    }

    final long startSortTime = System.currentTimeMillis();
    ranked.sortResults();
    serverLog.logFine("PLASMA", "plasmaSearch.order: minEntries = " + minEntries
            + ", effectiveEntries = " + ranked.sizeOrdered()
            + ", demanded Time = " + maxTime
            + ", effectiveTime = " + (System.currentTimeMillis() - startCreateTime)
            + ", createTime = " + (startSortTime - startCreateTime)
            + ", sortTime = " + (System.currentTimeMillis() - startSortTime));
    return ranked;
}
/*
public void preSearch() {
plasmaWordIndexEntity idx = null;
try {
// search the database locally
log.logFine("presearch: started job");
idx = searchHashes(query.queryHashes, time);
log.logFine("presearch: found " + idx.size() + " results");
plasmaSearchResult acc = order(idx, queryhashes, order, time, searchcount);
if (acc == null) return;
log.logFine("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch");
// take some elements and fetch the snippets
snippetCache.fetch(acc, queryhashes, urlmask, fetchcount);
} catch (IOException e) {
log.logSevere("presearch: failed", e);
} finally {
if (idx != null) try { idx.close(); } catch (Exception e){}
}
log.logFine("presearch: job terminated");
}
*/
}

@ -52,6 +52,9 @@ import de.anomic.server.serverByteBuffer;
public final class plasmaSearchQuery {
public static final String ORDER_QUALITY = "quality";
public static final String ORDER_DATE = "date";
public static final int SEARCHDOM_LOCAL = 0;
public static final int SEARCHDOM_GROUPDHT = 1;
public static final int SEARCHDOM_GROUPALL = 2;
@ -69,21 +72,35 @@ public final class plasmaSearchQuery {
public String domGroupName;
public int domMaxTargets;
public plasmaSearchQuery(Set queryWords, String referrer,
public plasmaSearchQuery(Set queryWords,
String[] order, int wantedResults, long maximumTime, String urlMask,
String referrer,
int domType, String domGroupName, int domMaxTargets) {
this.queryWords = queryWords;
this.queryHashes = words2hashes(queryWords);
this.referrer = referrer;
this.order = order;
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;
this.urlMask = urlMask;
this.referrer = referrer;
this.domType = domType;
this.domGroupName = domGroupName;
this.domMaxTargets = domMaxTargets;
}
/**
 * Creates a query from word hashes that were computed elsewhere.
 *
 * The plain query words are not known in this case, so queryWords is
 * null. Referrer and domain settings are set to "not set" values
 * (null / -1).
 *
 * @param queryHashes   the word hashes to search for
 * @param order         ranking order, e.g. {ORDER_QUALITY, ORDER_DATE}
 * @param wantedResults number of results the caller wants
 * @param maximumTime   time budget for the search in milliseconds
 * @param urlMask       mask that result URLs have to match
 */
public plasmaSearchQuery(Set queryHashes,
                         String[] order, int wantedResults, long maximumTime, String urlMask) {
    this.queryWords = null; // only the hashes are known
    this.queryHashes = queryHashes;
    this.order = order;
    this.wantedResults = wantedResults;
    this.maximumTime = maximumTime;
    this.urlMask = urlMask;
    // this constructor has no referrer parameter; the former
    // "this.referrer = referrer" only assigned the field to itself
    this.referrer = null;
    this.domType = -1;
    this.domGroupName = null;
    this.domMaxTargets = -1;
}
public static Set words2hashes(String[] words) {
TreeSet hashes = new TreeSet();
@ -117,4 +134,13 @@ public final class plasmaSearchQuery {
return query;
}
/**
 * Removes every query word that is contained in the given set.
 *
 * Does nothing if this query has no plain words (it was created from
 * hashes only, so queryWords is null) or if no blue list is given.
 *
 * NOTE(review): only queryWords is changed here; queryHashes keeps the
 * hashes of the removed words - confirm that this is intended.
 *
 * @param blueList words to remove from the query; may be null
 */
public void filterOut(Set blueList) {
    // hash-only queries carry no words, so there is nothing to filter
    if ((queryWords == null) || (blueList == null) || blueList.isEmpty()) return;

    Iterator it = queryWords.iterator();
    String word;
    while (it.hasNext()) {
        word = (String) it.next();
        // remove through the iterator to keep the iteration valid
        if (blueList.contains(word)) it.remove();
    }
}
}

@ -54,29 +54,23 @@ import de.anomic.server.serverCodings;
public final class plasmaSearchResult {
public static final char O_QUALITY = 'q';
public static final char O_AGE = 'a';
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private Set searchhashes; // hashes that are searched here
private Set stopwords; // words that are excluded from the commonSense heuristic
private char[] order; // order of heuristics
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
private plasmaSearchQuery query;
public plasmaSearchResult(Set searchhashes, Set stopwords, char[] order) {
public plasmaSearchResult(plasmaSearchQuery query) {
this.pageAcc = new TreeMap();
ref = new kelondroMScoreCluster();
this.searchhashes = searchhashes;
this.stopwords = stopwords;
this.order = order;
this.ref = new kelondroMScoreCluster();
this.results = new ArrayList();
this.query = query;
}
public plasmaSearchResult cloneSmart() {
// clones only the top structure
plasmaSearchResult theClone = new plasmaSearchResult(this.searchhashes, this.stopwords, this.order);
plasmaSearchResult theClone = new plasmaSearchResult(query);
theClone.pageAcc = (TreeMap) this.pageAcc.clone();
theClone.ref = this.ref;
theClone.results = this.results;
@ -149,10 +143,10 @@ public final class plasmaSearchResult {
// apply pre-calculated order attributes
ranking = 0;
if (order[0] == O_QUALITY) ranking = 4096 * indexEntry.getQuality();
else if (order[0] == O_AGE) ranking = 4096 * indexEntry.getVirtualAge();
if (order[1] == O_QUALITY) ranking += indexEntry.getQuality();
else if (order[1] == O_AGE) ranking += indexEntry.getVirtualAge();
if (query.order[0].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = 4096 * indexEntry.getQuality();
else if (query.order[0].equals(plasmaSearchQuery.ORDER_DATE)) ranking = 4096 * indexEntry.getVirtualAge();
if (query.order[1].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality();
else if (query.order[1].equals(plasmaSearchQuery.ORDER_DATE)) ranking += indexEntry.getVirtualAge();
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += inc;
@ -161,7 +155,7 @@ public final class plasmaSearchResult {
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps);
Iterator shi = searchhashes.iterator();
Iterator shi = query.queryHashes.iterator();
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) ranking += 10 * inc;
@ -187,9 +181,8 @@ public final class plasmaSearchResult {
for (int i = 0; i < words.length; i++) {
word = words[i].toLowerCase();
if ((word.length() > 2) &&
(!(stopwords.contains(word))) &&
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
(!(searchhashes.contains(plasmaWordIndexEntry.word2hash(word)))))
(!(query.queryHashes.contains(plasmaWordIndexEntry.word2hash(word)))))
ref.incScore(word);
}
}

@ -1394,13 +1394,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (date == null) return ""; else return DateFormatter.format(date);
}
/*
public class presearch extends Thread {
Set queryhashes;
char[] order;
String urlmask;
long time;
int searchcount, fetchcount;
public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int searchcount, int fetchcount) {
public presearch(Set queryhashes, char[] order, long time, String urlmask, int searchcount, int fetchcount) {
this.queryhashes = queryhashes;
this.order = order;
this.urlmask = urlmask;
@ -1430,38 +1431,34 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
*/
//public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) {
public serverObjects searchFromLocal(plasmaSearchQuery query) {
// tell all threads to do nothing for a specific time
wordIndex.intermission(query.maximumTime);
intermissionAllThreads(query.maximumTime);
wordIndex.intermission(2 * query.maximumTime);
intermissionAllThreads(2 * query.maximumTime);
serverObjects prop = new serverObjects();
try {
char[] order = new char[2];
if (query.order[0].equals("quality")) order[0] = plasmaSearchResult.O_QUALITY; else order[0] = plasmaSearchResult.O_AGE;
if (query.order[1].equals("quality")) order[1] = plasmaSearchResult.O_QUALITY; else order[1] = plasmaSearchResult.O_AGE;
//char[] order = new char[2];
//if (query.order[0].equals("quality")) order[0] = plasmaSearchResult.O_QUALITY; else order[0] = plasmaSearchResult.O_AGE;
//if (query.order[1].equals("quality")) order[1] = plasmaSearchResult.O_QUALITY; else order[1] = plasmaSearchResult.O_AGE;
// filter out words that appear in bluelist
Iterator it = query.queryWords.iterator();
String word, gs = "";
while (it.hasNext()) {
word = (String) it.next();
if (blueList.contains(word)) it.remove(); else gs += "+" + word;
}
if (gs.length() > 0) gs = gs.substring(1);
query.filterOut(blueList);
// log
log.logInfo("INIT WORD SEARCH: " + gs + ":" + query.queryHashes + " - " + query.wantedResults + " links, " + (query.maximumTime / 1000) + " seconds");
log.logInfo("INIT WORD SEARCH: " + query.queryWords + ":" + query.queryHashes + " - " + query.wantedResults + " links, " + (query.maximumTime / 1000) + " seconds");
long timestamp = System.currentTimeMillis();
// start a presearch, which makes only sense if we idle afterwards.
// this is especially the case if we start a global search and idle until search
if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
Thread preselect = new presearch(query.queryHashes, order, query.maximumTime / 10, query.urlMask, 10, 3);
preselect.start();
}
//if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
// Thread preselect = new presearch(query.queryHashes, order, query.maximumTime / 10, query.urlMask, 10, 3);
// preselect.start();
//}
// do global fetching
int globalresults = 0;
@ -1479,13 +1476,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// now search locally (the global results should be now in the local db)
long remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp);
plasmaWordIndexEntity idx = searchManager.searchHashes(query.queryHashes, remainingTime * 8 / 10); // the search
plasmaSearchEvent theSearch = new plasmaSearchEvent(query, log, wordIndex, urlPool.loadedURL, snippetCache);
plasmaWordIndexEntity idx = theSearch.search(remainingTime * 8 / 10);
log.logFine("SEARCH TIME AFTER FINDING " + idx.size() + " ELEMENTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp);
if (remainingTime < 500) remainingTime = 500;
if (remainingTime > 3000) remainingTime = 3000;
plasmaSearchResult acc = searchManager.order(idx, query.queryHashes, stopwords, order, remainingTime, 10);
plasmaSearchResult acc = theSearch.order(idx, remainingTime, 10);
if (query.domType != plasmaSearchQuery.SEARCHDOM_GLOBALDHT)
snippetCache.fetch(acc.cloneSmart(), query.queryHashes, query.urlMask, 10);
log.logFine("SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
@ -1595,7 +1593,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// log
log.logInfo("EXIT WORD SEARCH: " + gs + " - " +
log.logInfo("EXIT WORD SEARCH: " + query.queryWords + " - " +
prop.get("totalcount", "0") + " links found, " +
prop.get("orderedcount", "0") + " links ordered, " +
prop.get("linkcount", "?") + " links selected, " +
@ -1607,21 +1605,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
public serverObjects searchFromRemote(Set hashes, int count, boolean global, long duetime) {
public serverObjects searchFromRemote(plasmaSearchQuery query) {
// tell all threads to do nothing for a specific time
wordIndex.intermission(duetime);
intermissionAllThreads(duetime);
wordIndex.intermission(2 * query.maximumTime);
intermissionAllThreads(2 * query.maximumTime);
if (hashes == null) hashes = new HashSet();
serverObjects prop = new serverObjects();
try {
log.logInfo("INIT HASH SEARCH: " + hashes + " - " + count + " links");
log.logInfo("INIT HASH SEARCH: " + query.queryHashes + " - " + query.wantedResults + " links");
long timestamp = System.currentTimeMillis();
plasmaWordIndexEntity idx = searchManager.searchHashes(hashes, duetime * 8 / 10); // a nameless temporary index, not sorted by special order but by hash
long remainingTime = duetime - (System.currentTimeMillis() - timestamp);
plasmaSearchEvent theSearch = new plasmaSearchEvent(query, log, wordIndex, urlPool.loadedURL, snippetCache);
plasmaWordIndexEntity idx = theSearch.search(query.maximumTime * 8 / 10);
long remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp);
if (remainingTime < 500) remainingTime = 500;
plasmaSearchResult acc = searchManager.order(idx, hashes, stopwords, new char[]{plasmaSearchResult.O_QUALITY, plasmaSearchResult.O_AGE}, remainingTime, 10);
plasmaSearchResult acc = theSearch.order(idx, remainingTime, 10);
// result is a List of urlEntry elements
if (acc == null) {
@ -1636,9 +1634,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//plasmaIndexEntry pie;
plasmaCrawlLURL.Entry urlentry;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < count)) {
while ((acc.hasMoreElements()) && (i < query.wantedResults)) {
urlentry = acc.nextElement();
snippet = snippetCache.retrieve(urlentry.url(), hashes, false, 260);
snippet = snippetCache.retrieve(urlentry.url(), query.queryHashes, false, 260);
if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) {
// suppress line: there is no match in that resource
} else {
@ -1669,7 +1667,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations)
// log
log.logInfo("EXIT HASH SEARCH: " + hashes + " - " +
log.logInfo("EXIT HASH SEARCH: " + query.queryHashes + " - " +
((idx == null) ? "0" : (""+idx.size())) + " links found, " +
prop.get("linkcount", "?") + " links selected, " +
((System.currentTimeMillis() - timestamp) / 1000) + " seconds");

@ -53,6 +53,8 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.HashSet;
import java.util.Set;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.logging.serverLog;
@ -105,6 +107,28 @@ public final class plasmaWordIndex {
return ramCache.getIndex(wordHash, deleteIfEmpty);
}
/**
 * Retrieves the index entities for a set of word hashes.
 *
 * @param wordHashes       word hashes (Strings) to look up; null is
 *                         treated like an empty set
 * @param deleteIfEmpty    passed on to getEntity for every hash
 * @param interruptIfEmpty if true, the lookup stops and returns null as
 *                         soon as one hash has no entity or an empty one
 * @return the set of entities, or null if interruptIfEmpty is set and a
 *         missing/empty entity was found. If interruptIfEmpty is false
 *         the set may contain null for hashes without an entity.
 */
public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty) {
    HashSet entities = new HashSet();
    // no hashes, no entities
    if (wordHashes == null) return entities;

    String singleHash;
    plasmaWordIndexEntity singleEntity;
    Iterator i = wordHashes.iterator();
    while (i.hasNext()) {
        // get next hash:
        singleHash = (String) i.next();

        // retrieve index; honour the caller's deleteIfEmpty flag
        // (it was hard-coded to true before)
        singleEntity = getEntity(singleHash, deleteIfEmpty);

        // check result
        if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null;

        entities.add(singleEntity);
    }
    return entities;
}
public int size() {
return ramCache.size();
}

@ -46,6 +46,7 @@ import java.io.IOException;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Set;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree;
@ -293,6 +294,54 @@ public final class plasmaWordIndexEntity {
return l;
}
/**
 * Joins a set of index entities into one entity (conjunction).
 *
 * The entities are ordered by size and joined pairwise, smallest first,
 * using joinConstructive. Input entities that are not the join result are
 * closed after each step. If the time budget runs out before all entities
 * are joined, the result of the joins done so far is returned.
 *
 * NOTE(review): entities that are still waiting in the map when the loop
 * ends early (time used up or empty intermediate result) are not closed
 * here - confirm whether the caller is expected to do that.
 *
 * @param entities set of plasmaWordIndexEntity objects; null is treated
 *                 like an empty set
 * @param time     time budget for the join in milliseconds
 * @return the joined entity; an empty entity if the set is empty, if any
 *         entity is null or empty, or if the join result is empty
 * @throws IOException passed on from joinConstructive and close
 */
public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
    if (entities == null) return new plasmaWordIndexEntity(null);

    long stamp = System.currentTimeMillis();

    // order entities by their size
    // key = size * slots + running number; the running number is always
    // smaller than slots, so keys are unique and sort by size first.
    // The arithmetic is done in long: the former int expression
    // "size * 1000 + count" overflowed for large entities and produced
    // colliding keys for 1000 or more entities.
    final long slots = Math.max(1, entities.size());
    TreeMap map = new TreeMap();
    plasmaWordIndexEntity singleEntity;
    Iterator i = entities.iterator();
    int count = 0;
    while (i.hasNext()) {
        // get next entity:
        singleEntity = (plasmaWordIndexEntity) i.next();

        // check result
        if ((singleEntity == null) || (singleEntity.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a conjunction of searches, we have no result if any word is not known

        // store result in order of result size
        map.put(new Long(singleEntity.size() * slots + count), singleEntity);
        count++;
    }

    // check if there is any result
    if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found

    // the map now holds the search results in order of number of hits per word
    // we now must pairwise build up a conjunction of these sets
    Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries
    plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k);
    while ((map.size() > 0) && (searchResult.size() > 0) && (time > 0)) {
        // take the first element of map which is a result and combine it with result
        k = (Long) map.firstKey(); // the next smallest...
        // charge the time used so far against the budget
        time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
        searchA = searchResult;
        searchB = (plasmaWordIndexEntity) map.remove(k);
        // each join gets a share of the remaining time
        searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1));
        // close the input files/structures
        if (searchA != searchResult) searchA.close();
        if (searchB != searchResult) searchB.close();
    }
    searchA = null; // free resources
    searchB = null; // free resources

    // in 'searchResult' is now the combined search result
    if (searchResult.size() == 0) return new plasmaWordIndexEntity(null);
    return searchResult;
}
public static plasmaWordIndexEntity joinConstructive(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException {
if ((i1 == null) || (i2 == null)) return null;
if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntity(null);

@ -238,8 +238,10 @@ public abstract class serverAbstractThread extends Thread implements serverThrea
while (running) {
if (this.intermission > 0) {
if (this.intermission > System.currentTimeMillis()) {
ratz(this.intermission - System.currentTimeMillis());
long itime = this.intermission - System.currentTimeMillis();
if (itime > 0) {
logSystem("thread '" + this.getName() + "' breaks for intermission: " + (itime / 1000) + " seconds");
ratz(itime);
}
this.intermission = 0;
}

Loading…
Cancel
Save