fixed bug in remote search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4419 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent 7d875290b2
commit fa3b8f0ae1

@@ -115,6 +115,10 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
         return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key)));
     }
     
+    public long cardinal(indexRWIRowEntry t) {
+        return cardinal(new indexRWIVarEntry(t));
+    }
+    
     public long cardinal(indexRWIVarEntry t) {
         //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
         // the normalizedEntry must be a normalized indexEntry

@@ -89,12 +89,12 @@ public class indexRWIVarEntry implements indexRWIEntry
     }
     
     public boolean isNewer(indexRWIEntry other) {
-        // TODO Auto-generated method stub
+        assert false; // should not be used
         return false;
     }
     
     public boolean isOlder(indexRWIEntry other) {
-        // TODO Auto-generated method stub
+        assert false; // should not be used
         return false;
     }
@@ -131,12 +131,12 @@ public class indexRWIVarEntry implements indexRWIEntry
     }
     
     public Entry toKelondroEntry() {
-        // TODO Auto-generated method stub
+        assert false; // should not be used
         return null;
     }
     
     public String toPropertyForm() {
-        // TODO Auto-generated method stub
+        assert false; // should not be used
         return null;
     }

@@ -115,7 +115,7 @@ public class indexURLEntry
     private kelondroRow.Entry entry;
     private String snippet;
-    private indexRWIEntry word; // this is only used if the url is transported via remote search requests
+    private indexRWIRowEntry word; // this is only used if the url is transported via remote search requests
     private long ranking; // during generation of a search result this value is set
     
     public indexURLEntry(
@@ -185,7 +185,7 @@ public class indexURLEntry
         return s.toString().getBytes();
     }
     
-    public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord, long ranking) {
+    public indexURLEntry(kelondroRow.Entry entry, indexRWIRowEntry searchedWord, long ranking) {
         this.entry = entry;
         this.snippet = null;
         this.word = searchedWord;
@@ -287,7 +287,7 @@ public class indexURLEntry
             // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
             // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
             // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
-            // e.printStackTrace();
+            e.printStackTrace();
             return null;
         }
     }
@@ -391,7 +391,7 @@ public class indexURLEntry
         return snippet;
     }
     
-    public indexRWIEntry word() {
+    public indexRWIRowEntry word() {
         return word;
     }

@@ -66,7 +66,7 @@ import java.util.LinkedList;
 import de.anomic.data.htmlTools;
 import de.anomic.http.httpc;
 import de.anomic.http.httpc.response;
-import de.anomic.index.indexRWIEntry;
+import de.anomic.index.indexRWIRowEntry;
 import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroCache;
@@ -153,7 +153,7 @@ public final class plasmaCrawlLURL
         return 0;
     }
     
-    public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord, long ranking) {
+    public synchronized indexURLEntry load(String urlHash, indexRWIRowEntry searchedWord, long ranking) {
         // generates an plasmaLURLEntry using the url hash
         // to speed up the access, the url-hashes are buffered
         // in the hash cache.

@@ -213,7 +213,7 @@ public class plasmaDHTChunk
         final Iterator<indexContainer> indexContainerIterator = wordIndex.indexContainerSet(hash, ram, true, maxcount).iterator();
         indexContainer container;
         Iterator<indexRWIRowEntry> urlIter;
-        indexRWIEntry iEntry;
+        indexRWIRowEntry iEntry;
         indexURLEntry lurl;
         int refcount = 0;
         int wholesize;
@@ -243,7 +243,7 @@ public class plasmaDHTChunk
                 // CPU & IO reduce
                 // try { Thread.sleep(50); } catch (InterruptedException e) { }
-                iEntry = (indexRWIEntry) urlIter.next();
+                iEntry = urlIter.next();
                 if ((iEntry == null) || (iEntry.urlHash() == null)) {
                     urlIter.remove();
                     continue;
@@ -263,7 +263,7 @@ public class plasmaDHTChunk
                 // remove all remaining; we have enough
                 while (urlIter.hasNext()) {
-                    iEntry = (indexRWIEntry) urlIter.next();
+                    iEntry = urlIter.next();
                     urlIter.remove();
                 }

@@ -347,7 +347,7 @@ public final class plasmaSearchEvent
             if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
                 // attach text snippet
                 startTime = System.currentTimeMillis();
-                plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
+                plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
                 long snippetComputationTime = System.currentTimeMillis() - startTime;
                 serverLog.logInfo("SEARCH_EVENT", "text snippet load time for " + comp.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));

@@ -40,7 +40,6 @@ import de.anomic.index.indexContainer;
 import de.anomic.index.indexRWIEntry;
 import de.anomic.index.indexRWIEntryOrder;
 import de.anomic.index.indexRWIRowEntry;
-import de.anomic.index.indexRWIVarEntry;
 import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBinSearch;
 import de.anomic.kelondro.kelondroMScoreCluster;
@@ -53,8 +52,8 @@ public final class plasmaSearchRankingProcess
     public static kelondroBinSearch[] ybrTables = null; // block-rank tables
     private static boolean useYBR = true;
     
-    private TreeMap<Object, indexRWIEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
-    private HashMap<String, TreeMap<Object, indexRWIEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
+    private TreeMap<Object, indexRWIRowEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
+    private HashMap<String, TreeMap<Object, indexRWIRowEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
     private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
     private plasmaSearchQuery query;
     private int sortorder;
@@ -74,8 +73,8 @@ public final class plasmaSearchRankingProcess
         // attention: if minEntries is too high, this method will not terminate within the maxTime
         // sortorder: 0 = hash, 1 = url, 2 = ranking
         this.localSearchContainerMaps = null;
-        this.sortedRWIEntries = new TreeMap<Object, indexRWIEntry>();
-        this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIEntry>>();
+        this.sortedRWIEntries = new TreeMap<Object, indexRWIRowEntry>();
+        this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIRowEntry>>();
         this.handover = new HashMap<String, String>();
         this.filteredCount = 0;
         this.order = null;
@@ -124,11 +123,11 @@ public final class plasmaSearchRankingProcess
         final Iterator<indexRWIRowEntry> en = index.entries();
         // generate a new map where the urls are sorted (not by hash but by the url text)
         
-        indexRWIEntry ientry;
+        indexRWIRowEntry ientry;
         indexURLEntry uentry;
         String u;
         loop: while (en.hasNext()) {
-            ientry = (indexRWIEntry) en.next();
+            ientry = en.next();
             
             // check constraints
             if (!testFlags(ientry)) continue loop;
@@ -181,12 +180,12 @@ public final class plasmaSearchRankingProcess
         // normalize entries and get ranking
         timer = System.currentTimeMillis();
         Iterator<indexRWIRowEntry> i = index.entries();
-        indexRWIVarEntry iEntry, l;
+        indexRWIRowEntry iEntry, l;
         long biggestEntry = 0;
         //long s0 = System.currentTimeMillis();
         Long r;
         while (i.hasNext()) {
-            iEntry = new indexRWIVarEntry(i.next());
+            iEntry = i.next();
             if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
             
             // increase flag counts
@@ -216,11 +215,11 @@ public final class plasmaSearchRankingProcess
                 continue;
             } else {
                 if (urlhashes.containsKey(iEntry.urlHash())) continue;
-                l = (indexRWIVarEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
+                l = sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
                 urlhashes.remove(l.urlHash());
                 while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
                 sortedRWIEntries.put(r, iEntry);
-                biggestEntry = order.cardinal((indexRWIVarEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey()));
+                biggestEntry = order.cardinal(sortedRWIEntries.get(sortedRWIEntries.lastKey()));
             }
         }
@@ -267,18 +266,18 @@ public final class plasmaSearchRankingProcess
     private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) {
         // returns from the current RWI list the best entry and removed this entry from the list
         Object bestEntry;
-        TreeMap<Object, indexRWIEntry> m;
-        indexRWIEntry rwi;
+        TreeMap<Object, indexRWIRowEntry> m;
+        indexRWIRowEntry rwi;
         while (sortedRWIEntries.size() > 0) {
             bestEntry = sortedRWIEntries.firstKey();
-            rwi = (indexRWIEntry) sortedRWIEntries.remove(bestEntry);
+            rwi = sortedRWIEntries.remove(bestEntry);
             if (!skipDoubleDom) return new Object[]{bestEntry, rwi};
             // check doubledom
             String domhash = rwi.urlHash().substring(6);
-            m = (TreeMap<Object, indexRWIEntry>) this.doubleDomCache.get(domhash);
+            m = this.doubleDomCache.get(domhash);
             if (m == null) {
                 // first appearance of dom
-                m = new TreeMap<Object, indexRWIEntry>();
+                m = new TreeMap<Object, indexRWIRowEntry>();
                 this.doubleDomCache.put(domhash, m);
                 return new Object[]{bestEntry, rwi};
             }
@@ -287,20 +286,20 @@ public final class plasmaSearchRankingProcess
         }
         // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
         // find best entry from all caches
-        Iterator<TreeMap<Object, indexRWIEntry>> i = this.doubleDomCache.values().iterator();
+        Iterator<TreeMap<Object, indexRWIRowEntry>> i = this.doubleDomCache.values().iterator();
         bestEntry = null;
         Object o;
-        indexRWIEntry bestrwi = null;
+        indexRWIRowEntry bestrwi = null;
         while (i.hasNext()) {
             m = i.next();
             if (m.size() == 0) continue;
             if (bestEntry == null) {
                 bestEntry = m.firstKey();
-                bestrwi = (indexRWIEntry) m.remove(bestEntry);
+                bestrwi = m.remove(bestEntry);
                 continue;
             }
             o = m.firstKey();
-            rwi = (indexRWIEntry) m.remove(o);
+            rwi = m.remove(o);
             if (o instanceof Long) {
                 if (((Long) o).longValue() < ((Long) bestEntry).longValue()) {
                     bestEntry = o;
@@ -326,7 +325,7 @@ public final class plasmaSearchRankingProcess
         while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
             Object[] obrwi = bestRWI(skipDoubleDom);
             Object bestEntry = obrwi[0];
-            indexRWIEntry ientry = (indexRWIEntry) obrwi[1];
+            indexRWIRowEntry ientry = (indexRWIRowEntry) obrwi[1];
             long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0;
             indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking);
             if (u != null) {
@@ -342,7 +341,7 @@ public final class plasmaSearchRankingProcess
     public synchronized int size() {
         //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
         int c = sortedRWIEntries.size();
-        Iterator<TreeMap<Object, indexRWIEntry>> i = this.doubleDomCache.values().iterator();
+        Iterator<TreeMap<Object, indexRWIRowEntry>> i = this.doubleDomCache.values().iterator();
         while (i.hasNext()) c += i.next().size();
         return c;
     }

@@ -59,6 +59,7 @@ import java.util.TreeSet;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
+import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroMScoreCluster;
 import de.anomic.kelondro.kelondroMSetTools;
 import de.anomic.plasma.cache.IResourceInfo;
@@ -246,9 +247,9 @@ public class plasmaSnippetCache
     }
     
     @SuppressWarnings("unchecked")
-    public static TextSnippet retrieveTextSnippet(yacyURL url, Set<String> queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
+    public static TextSnippet retrieveTextSnippet(indexURLEntry.Components comp, Set<String> queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
         // heise = "0OQUNU3JSs05"
+        yacyURL url = comp.url();
         if (queryhashes.size() == 0) {
             //System.out.println("found no queryhashes for URL retrieve " + url);
             return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given");
@@ -258,8 +259,8 @@ public class plasmaSnippetCache
         int source = SOURCE_CACHE;
         String wordhashes = yacySearch.set2string(queryhashes);
         String line = retrieveFromCache(wordhashes, url.hash());
         if (line != null) {
-            //System.out.println("found snippet for URL " + url + " in cache: " + line);
+            // found the snippet
             return new TextSnippet(url, line, source, null, null, faviconCache.get(url.hash()));
         }
@@ -279,7 +280,11 @@ public class plasmaSnippetCache
             if ((resContentLength > maxDocLen) && (!fetchOnline)) {
                 // content may be too large to be parsed here. To be fast, we omit calculation of snippet here
                 return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
-            }
+            }/*
+        } else if (url.) {
+            // try to create the snippet from information given in the url itself
+*/
         } else if (fetchOnline) {
             // if not found try to download it
@@ -342,7 +347,7 @@ public class plasmaSnippetCache
         if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon);
         Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
         String textline = (tsr == null) ? null : (String) tsr[0];
-        Set<String> remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
+        Set<String> remainingHashes = (tsr == null) ? queryhashes : (Set<String>) tsr[1];
         
         // compute snippet from media
         String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);

@@ -598,7 +598,7 @@ public final class plasmaWordIndex implements indexRI
         public void run() {
             serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
             indexContainer container = null;
-            indexRWIEntry entry = null;
+            indexRWIRowEntry entry = null;
             yacyURL url = null;
             HashSet<String> urlHashs = new HashSet<String>();
             Iterator<indexContainer> indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator();
@@ -609,7 +609,7 @@ public final class plasmaWordIndex implements indexRI
                 wordHashNow = container.getWordHash();
                 while (containerIterator.hasNext() && run) {
                     waiter();
-                    entry = (indexRWIEntry) containerIterator.next();
+                    entry = containerIterator.next();
                     // System.out.println("Wordhash: "+wordHash+" UrlHash:
                     // "+entry.getUrlHash());
                     indexURLEntry ue = lurl.load(entry.urlHash(), entry, 0);
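Note on the recurring pattern: most hunks re-type fields, locals, and collections from the indexRWIEntry interface to the concrete indexRWIRowEntry class, which removes the scattered (indexRWIEntry) and (indexRWIVarEntry) casts and lets the compiler check what actually flows through the remote-search result path. A minimal sketch of that idea, using simplified hypothetical names (Entry, RowEntry, RankingQueue) rather than the real YaCy classes, and a plain Long key instead of the Object key the real code uses:

import java.util.TreeMap;

// Hypothetical stand-ins for indexRWIEntry (interface) and
// indexRWIRowEntry (concrete row-backed implementation).
interface Entry {
    String urlHash();
}

final class RowEntry implements Entry {
    private final String hash;
    RowEntry(String hash) { this.hash = hash; }
    public String urlHash() { return hash; }
}

public final class RankingQueue {
    // Before: a map typed TreeMap<Object, Entry> forced a cast at every
    // read and allowed any Entry implementation to be inserted.
    // After: typing the map against the concrete class removes both problems.
    private final TreeMap<Long, RowEntry> sorted = new TreeMap<Long, RowEntry>();

    public void add(long ranking, RowEntry e) {
        sorted.put(Long.valueOf(ranking), e);
    }

    public RowEntry best() {
        // no (Entry) or (RowEntry) cast required any more
        return sorted.remove(sorted.firstKey());
    }

    public static void main(String[] args) {
        RankingQueue q = new RankingQueue();
        q.add(42L, new RowEntry("AAAAAAAAAAAA"));
        System.out.println(q.best().urlHash()); // prints AAAAAAAAAAAA
    }
}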
