From fa3b8f0ae1855e7544d8c651dc70e77936e619ee Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 30 Jan 2008 00:15:43 +0000 Subject: [PATCH] fixed bug in remote search git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4419 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../de/anomic/index/indexRWIEntryOrder.java | 4 ++ source/de/anomic/index/indexRWIVarEntry.java | 8 ++-- source/de/anomic/index/indexURLEntry.java | 8 ++-- source/de/anomic/plasma/plasmaCrawlLURL.java | 4 +- source/de/anomic/plasma/plasmaDHTChunk.java | 6 +-- .../de/anomic/plasma/plasmaSearchEvent.java | 2 +- .../plasma/plasmaSearchRankingProcess.java | 43 +++++++++---------- .../de/anomic/plasma/plasmaSnippetCache.java | 17 +++++--- source/de/anomic/plasma/plasmaWordIndex.java | 4 +- 9 files changed, 52 insertions(+), 44 deletions(-) diff --git a/source/de/anomic/index/indexRWIEntryOrder.java b/source/de/anomic/index/indexRWIEntryOrder.java index 9e8b5b05e..9fef998b4 100644 --- a/source/de/anomic/index/indexRWIEntryOrder.java +++ b/source/de/anomic/index/indexRWIEntryOrder.java @@ -115,6 +115,10 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key))); } + public long cardinal(indexRWIRowEntry t) { + return cardinal(new indexRWIVarEntry(t)); + } + public long cardinal(indexRWIVarEntry t) { //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords); // the normalizedEntry must be a normalized indexEntry diff --git a/source/de/anomic/index/indexRWIVarEntry.java b/source/de/anomic/index/indexRWIVarEntry.java index 838591ba0..1cf7e73ca 100644 --- a/source/de/anomic/index/indexRWIVarEntry.java +++ b/source/de/anomic/index/indexRWIVarEntry.java @@ -89,12 +89,12 @@ public class indexRWIVarEntry implements indexRWIEntry { } public boolean isNewer(indexRWIEntry other) { - // TODO Auto-generated method stub + assert false; // should not be used return false; } public boolean isOlder(indexRWIEntry other) { - // TODO Auto-generated method stub + assert false; // should not be used return false; } @@ -131,12 +131,12 @@ public class indexRWIVarEntry implements indexRWIEntry { } public Entry toKelondroEntry() { - // TODO Auto-generated method stub + assert false; // should not be used return null; } public String toPropertyForm() { - // TODO Auto-generated method stub + assert false; // should not be used return null; } diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index 2faee9ac2..c317fe80f 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -115,7 +115,7 @@ public class indexURLEntry { private kelondroRow.Entry entry; private String snippet; - private indexRWIEntry word; // this is only used if the url is transported via remote search requests + private indexRWIRowEntry word; // this is only used if the url is transported via remote search requests private long ranking; // during generation of a search result this value is set public indexURLEntry( @@ -185,7 +185,7 @@ public class indexURLEntry { return s.toString().getBytes(); } - public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord, long ranking) { + public indexURLEntry(kelondroRow.Entry entry, indexRWIRowEntry searchedWord, long ranking) { this.entry = entry; this.snippet = null; this.word = searchedWord; @@ -287,7 +287,7 @@ public class indexURLEntry { // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null"); // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null"); - // e.printStackTrace(); + e.printStackTrace(); return null; } } @@ -391,7 +391,7 @@ public class indexURLEntry { return snippet; } - public indexRWIEntry word() { + public indexRWIRowEntry word() { return word; } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 59d754615..e2a6da400 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -66,7 +66,7 @@ import java.util.LinkedList; import de.anomic.data.htmlTools; import de.anomic.http.httpc; import de.anomic.http.httpc.response; -import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIRowEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; @@ -153,7 +153,7 @@ public final class plasmaCrawlLURL { return 0; } - public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord, long ranking) { + public synchronized indexURLEntry load(String urlHash, indexRWIRowEntry searchedWord, long ranking) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index 9e6024b90..2e842aebb 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -213,7 +213,7 @@ public class plasmaDHTChunk { final Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, ram, true, maxcount).iterator(); indexContainer container; Iterator urlIter; - indexRWIEntry iEntry; + indexRWIRowEntry iEntry; indexURLEntry lurl; int refcount = 0; int wholesize; @@ -243,7 +243,7 @@ public class plasmaDHTChunk { // CPU & IO reduce // try { Thread.sleep(50); } catch (InterruptedException e) { } - iEntry = (indexRWIEntry) urlIter.next(); + iEntry = urlIter.next(); if ((iEntry == null) || (iEntry.urlHash() == null)) { urlIter.remove(); continue; @@ -263,7 +263,7 @@ public class plasmaDHTChunk { // remove all remaining; we have enough while (urlIter.hasNext()) { - iEntry = (indexRWIEntry) urlIter.next(); + iEntry = urlIter.next(); urlIter.remove(); } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index fbf8c6dde..685553c77 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -347,7 +347,7 @@ public final class plasmaSearchEvent { if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) { // attach text snippet startTime = System.currentTimeMillis(); - plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000); + plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000); long snippetComputationTime = System.currentTimeMillis() - startTime; serverLog.logInfo("SEARCH_EVENT", "text snippet load time for " + comp.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")"))); diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index f2fcbc557..6c212ce4a 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -40,7 +40,6 @@ import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRWIEntryOrder; import de.anomic.index.indexRWIRowEntry; -import de.anomic.index.indexRWIVarEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBinSearch; import de.anomic.kelondro.kelondroMScoreCluster; @@ -53,8 +52,8 @@ public final class plasmaSearchRankingProcess { public static kelondroBinSearch[] ybrTables = null; // block-rank tables private static boolean useYBR = true; - private TreeMap sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String - private HashMap> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries + private TreeMap sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String + private HashMap> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries private HashMap handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process private plasmaSearchQuery query; private int sortorder; @@ -74,8 +73,8 @@ public final class plasmaSearchRankingProcess { // attention: if minEntries is too high, this method will not terminate within the maxTime // sortorder: 0 = hash, 1 = url, 2 = ranking this.localSearchContainerMaps = null; - this.sortedRWIEntries = new TreeMap(); - this.doubleDomCache = new HashMap>(); + this.sortedRWIEntries = new TreeMap(); + this.doubleDomCache = new HashMap>(); this.handover = new HashMap(); this.filteredCount = 0; this.order = null; @@ -124,11 +123,11 @@ public final class plasmaSearchRankingProcess { final Iterator en = index.entries(); // generate a new map where the urls are sorted (not by hash but by the url text) - indexRWIEntry ientry; + indexRWIRowEntry ientry; indexURLEntry uentry; String u; loop: while (en.hasNext()) { - ientry = (indexRWIEntry) en.next(); + ientry = en.next(); // check constraints if (!testFlags(ientry)) continue loop; @@ -181,12 +180,12 @@ public final class plasmaSearchRankingProcess { // normalize entries and get ranking timer = System.currentTimeMillis(); Iterator i = index.entries(); - indexRWIVarEntry iEntry, l; + indexRWIRowEntry iEntry, l; long biggestEntry = 0; //long s0 = System.currentTimeMillis(); Long r; while (i.hasNext()) { - iEntry = new indexRWIVarEntry(i.next()); + iEntry = i.next(); if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; // increase flag counts @@ -216,11 +215,11 @@ public final class plasmaSearchRankingProcess { continue; } else { if (urlhashes.containsKey(iEntry.urlHash())) continue; - l = (indexRWIVarEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey()); + l = sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey()); urlhashes.remove(l.urlHash()); while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1); sortedRWIEntries.put(r, iEntry); - biggestEntry = order.cardinal((indexRWIVarEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey())); + biggestEntry = order.cardinal(sortedRWIEntries.get(sortedRWIEntries.lastKey())); } } @@ -267,18 +266,18 @@ public final class plasmaSearchRankingProcess { private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) { // returns from the current RWI list the best entry and removed this entry from the list Object bestEntry; - TreeMap m; - indexRWIEntry rwi; + TreeMap m; + indexRWIRowEntry rwi; while (sortedRWIEntries.size() > 0) { bestEntry = sortedRWIEntries.firstKey(); - rwi = (indexRWIEntry) sortedRWIEntries.remove(bestEntry); + rwi = sortedRWIEntries.remove(bestEntry); if (!skipDoubleDom) return new Object[]{bestEntry, rwi}; // check doubledom String domhash = rwi.urlHash().substring(6); - m = (TreeMap) this.doubleDomCache.get(domhash); + m = this.doubleDomCache.get(domhash); if (m == null) { // first appearance of dom - m = new TreeMap(); + m = new TreeMap(); this.doubleDomCache.put(domhash, m); return new Object[]{bestEntry, rwi}; } @@ -287,20 +286,20 @@ public final class plasmaSearchRankingProcess { } // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache // find best entry from all caches - Iterator> i = this.doubleDomCache.values().iterator(); + Iterator> i = this.doubleDomCache.values().iterator(); bestEntry = null; Object o; - indexRWIEntry bestrwi = null; + indexRWIRowEntry bestrwi = null; while (i.hasNext()) { m = i.next(); if (m.size() == 0) continue; if (bestEntry == null) { bestEntry = m.firstKey(); - bestrwi = (indexRWIEntry) m.remove(bestEntry); + bestrwi = m.remove(bestEntry); continue; } o = m.firstKey(); - rwi = (indexRWIEntry) m.remove(o); + rwi = m.remove(o); if (o instanceof Long) { if (((Long) o).longValue() < ((Long) bestEntry).longValue()) { bestEntry = o; @@ -326,7 +325,7 @@ public final class plasmaSearchRankingProcess { while ((sortedRWIEntries.size() > 0) || (size() > 0)) { Object[] obrwi = bestRWI(skipDoubleDom); Object bestEntry = obrwi[0]; - indexRWIEntry ientry = (indexRWIEntry) obrwi[1]; + indexRWIRowEntry ientry = (indexRWIRowEntry) obrwi[1]; long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0; indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking); if (u != null) { @@ -342,7 +341,7 @@ public final class plasmaSearchRankingProcess { public synchronized int size() { //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size(); int c = sortedRWIEntries.size(); - Iterator> i = this.doubleDomCache.values().iterator(); + Iterator> i = this.doubleDomCache.values().iterator(); while (i.hasNext()) c += i.next().size(); return c; } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index caf195ae4..47b314544 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -59,6 +59,7 @@ import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.http.httpHeader; import de.anomic.http.httpc; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.plasma.cache.IResourceInfo; @@ -246,9 +247,9 @@ public class plasmaSnippetCache { } @SuppressWarnings("unchecked") - public static TextSnippet retrieveTextSnippet(yacyURL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) { + public static TextSnippet retrieveTextSnippet(indexURLEntry.Components comp, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) { // heise = "0OQUNU3JSs05" - + yacyURL url = comp.url(); if (queryhashes.size() == 0) { //System.out.println("found no queryhashes for URL retrieve " + url); return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given"); @@ -258,8 +259,8 @@ public class plasmaSnippetCache { int source = SOURCE_CACHE; String wordhashes = yacySearch.set2string(queryhashes); String line = retrieveFromCache(wordhashes, url.hash()); - if (line != null) { - //System.out.println("found snippet for URL " + url + " in cache: " + line); + if (line != null) { + // found the snippet return new TextSnippet(url, line, source, null, null, faviconCache.get(url.hash())); } @@ -279,7 +280,11 @@ public class plasmaSnippetCache { if ((resContentLength > maxDocLen) && (!fetchOnline)) { // content may be too large to be parsed here. To be fast, we omit calculation of snippet here return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes"); - } + }/* + } else if (url.) { + // try to create the snippet from information given in the url itself + */ + } else if (fetchOnline) { // if not found try to download it @@ -342,7 +347,7 @@ public class plasmaSnippetCache { if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon); Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength); String textline = (tsr == null) ? null : (String) tsr[0]; - Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1]; + Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1]; // compute snippet from media String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index ce2410611..44c396942 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -598,7 +598,7 @@ public final class plasmaWordIndex implements indexRI { public void run() { serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); indexContainer container = null; - indexRWIEntry entry = null; + indexRWIRowEntry entry = null; yacyURL url = null; HashSet urlHashs = new HashSet(); Iterator indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator(); @@ -609,7 +609,7 @@ public final class plasmaWordIndex implements indexRI { wordHashNow = container.getWordHash(); while (containerIterator.hasNext() && run) { waiter(); - entry = (indexRWIEntry) containerIterator.next(); + entry = containerIterator.next(); // System.out.println("Wordhash: "+wordHash+" UrlHash: // "+entry.getUrlHash()); indexURLEntry ue = lurl.load(entry.urlHash(), entry, 0);