From fa3b8f0ae1855e7544d8c651dc70e77936e619ee Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Wed, 30 Jan 2008 00:15:43 +0000
Subject: [PATCH] fixed bug in remote search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4419 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../de/anomic/index/indexRWIEntryOrder.java   |  4 ++
 source/de/anomic/index/indexRWIVarEntry.java  |  8 ++--
 source/de/anomic/index/indexURLEntry.java     |  8 ++--
 source/de/anomic/plasma/plasmaCrawlLURL.java  |  4 +-
 source/de/anomic/plasma/plasmaDHTChunk.java   |  6 +--
 .../de/anomic/plasma/plasmaSearchEvent.java   |  2 +-
 .../plasma/plasmaSearchRankingProcess.java    | 43 +++++++++----------
 .../de/anomic/plasma/plasmaSnippetCache.java  | 17 +++++---
 source/de/anomic/plasma/plasmaWordIndex.java  |  4 +-
 9 files changed, 52 insertions(+), 44 deletions(-)
diff --git a/source/de/anomic/index/indexRWIEntryOrder.java b/source/de/anomic/index/indexRWIEntryOrder.java
index 9e8b5b05e..9fef998b4 100644
--- a/source/de/anomic/index/indexRWIEntryOrder.java
+++ b/source/de/anomic/index/indexRWIEntryOrder.java
@@ -115,6 +115,10 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
         return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key)));
     }
 
+    public long cardinal(indexRWIRowEntry t) {
+        return cardinal(new indexRWIVarEntry(t)); // wrap the row entry and delegate to cardinal(indexRWIVarEntry)
+    }
+
     public long cardinal(indexRWIVarEntry t) {
         //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
         // the normalizedEntry must be a normalized indexEntry
diff --git a/source/de/anomic/index/indexRWIVarEntry.java b/source/de/anomic/index/indexRWIVarEntry.java
index 838591ba0..1cf7e73ca 100644
--- a/source/de/anomic/index/indexRWIVarEntry.java
+++ b/source/de/anomic/index/indexRWIVarEntry.java
@@ -89,12 +89,12 @@ public class indexRWIVarEntry implements indexRWIEntry {
     }
 
     public boolean isNewer(indexRWIEntry other) {
-        // TODO Auto-generated method stub
+        assert false; // should not be used
         return false;
     }
 
     public boolean isOlder(indexRWIEntry other) {
-        // TODO Auto-generated method stub
+        assert false; // should not be used
         return false;
     }
 
@@ -131,12 +131,12 @@ public class indexRWIVarEntry implements indexRWIEntry {
     }
 
     public Entry toKelondroEntry() {
-        // TODO Auto-generated method stub
+        assert false; // should not be used
         return null;
     }
 
     public String toPropertyForm() {
-        // TODO Auto-generated method stub
+        assert false; // should not be used
         return null;
     }
 
diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java
index 2faee9ac2..c317fe80f 100644
--- a/source/de/anomic/index/indexURLEntry.java
+++ b/source/de/anomic/index/indexURLEntry.java
@@ -115,7 +115,7 @@ public class indexURLEntry {
     
     private kelondroRow.Entry entry;
     private String snippet;
-    private indexRWIEntry word; // this is only used if the url is transported via remote search requests
+    private indexRWIRowEntry word; // this is only used if the url is transported via remote search requests
     private long ranking; // during generation of a search result this value is set
     
     public indexURLEntry(
@@ -185,7 +185,7 @@ public class indexURLEntry {
         return s.toString().getBytes();
     }
     
-    public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord, long ranking) {
+    public indexURLEntry(kelondroRow.Entry entry, indexRWIRowEntry searchedWord, long ranking) {
         this.entry = entry;
         this.snippet = null;
         this.word = searchedWord;
@@ -287,7 +287,7 @@ public class indexURLEntry {
             //          serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
             //          if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
             //          if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
-            //          e.printStackTrace();
+            e.printStackTrace();
             return null;
         }
     }
@@ -391,7 +391,7 @@ public class indexURLEntry {
         return snippet;
     }
 
-    public indexRWIEntry word() {
+    public indexRWIRowEntry word() {
         return word;
     }
 
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index 59d754615..e2a6da400 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -66,7 +66,7 @@ import java.util.LinkedList;
 import de.anomic.data.htmlTools;
 import de.anomic.http.httpc;
 import de.anomic.http.httpc.response;
-import de.anomic.index.indexRWIEntry;
+import de.anomic.index.indexRWIRowEntry;
 import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroCache;
@@ -153,7 +153,7 @@ public final class plasmaCrawlLURL {
         return 0;
     }
 
-    public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord, long ranking) {
+    public synchronized indexURLEntry load(String urlHash, indexRWIRowEntry searchedWord, long ranking) {
         // generates an plasmaLURLEntry using the url hash
         // to speed up the access, the url-hashes are buffered
         // in the hash cache.
diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java
index 9e6024b90..2e842aebb 100644
--- a/source/de/anomic/plasma/plasmaDHTChunk.java
+++ b/source/de/anomic/plasma/plasmaDHTChunk.java
@@ -213,7 +213,7 @@ public class plasmaDHTChunk {
             final Iterator<indexContainer> indexContainerIterator = wordIndex.indexContainerSet(hash, ram, true, maxcount).iterator();
             indexContainer container;
             Iterator<indexRWIRowEntry> urlIter;
-            indexRWIEntry iEntry;
+            indexRWIRowEntry iEntry;
             indexURLEntry lurl;
             int refcount = 0;
             int wholesize;
@@ -243,7 +243,7 @@ public class plasmaDHTChunk {
                         // CPU & IO reduce
                         // try { Thread.sleep(50); } catch (InterruptedException e) { }
 
-                        iEntry = (indexRWIEntry) urlIter.next();
+                        iEntry = urlIter.next();
                         if ((iEntry == null) || (iEntry.urlHash() == null)) {
                             urlIter.remove();
                             continue;
@@ -263,7 +263,7 @@ public class plasmaDHTChunk {
 
                     // remove all remaining; we have enough
                     while (urlIter.hasNext()) {
-                        iEntry = (indexRWIEntry) urlIter.next();
+                        iEntry = urlIter.next();
                         urlIter.remove();
                     }
 
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index fbf8c6dde..685553c77 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -347,7 +347,7 @@ public final class plasmaSearchEvent {
         if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
             // attach text snippet
             startTime = System.currentTimeMillis();
-            plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
+            plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
             long snippetComputationTime = System.currentTimeMillis() - startTime;
             serverLog.logInfo("SEARCH_EVENT", "text snippet load time for " + comp.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
             
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
index f2fcbc557..6c212ce4a 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
@@ -40,7 +40,6 @@ import de.anomic.index.indexContainer;
 import de.anomic.index.indexRWIEntry;
 import de.anomic.index.indexRWIEntryOrder;
 import de.anomic.index.indexRWIRowEntry;
-import de.anomic.index.indexRWIVarEntry;
 import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroBinSearch;
 import de.anomic.kelondro.kelondroMScoreCluster;
@@ -53,8 +52,8 @@ public final class plasmaSearchRankingProcess {
     public  static kelondroBinSearch[] ybrTables = null; // block-rank tables
     private static boolean useYBR = true;
     
-    private TreeMap<Object, indexRWIEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
-    private HashMap<String, TreeMap<Object, indexRWIEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
+    private TreeMap<Object, indexRWIRowEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIRowEntry; if sortorder < 2 then key is instance of String
+    private HashMap<String, TreeMap<Object, indexRWIRowEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
     private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
     private plasmaSearchQuery query;
     private int sortorder;
@@ -74,8 +73,8 @@ public final class plasmaSearchRankingProcess {
         // attention: if minEntries is too high, this method will not terminate within the maxTime
         // sortorder: 0 = hash, 1 = url, 2 = ranking
         this.localSearchContainerMaps = null;
-        this.sortedRWIEntries = new TreeMap<Object, indexRWIEntry>();
-        this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIEntry>>();
+        this.sortedRWIEntries = new TreeMap<Object, indexRWIRowEntry>();
+        this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIRowEntry>>();
         this.handover = new HashMap<String, String>();
         this.filteredCount = 0;
         this.order = null;
@@ -124,11 +123,11 @@ public final class plasmaSearchRankingProcess {
         final Iterator<indexRWIRowEntry> en = index.entries();
         // generate a new map where the urls are sorted (not by hash but by the url text)
         
-        indexRWIEntry ientry;
+        indexRWIRowEntry ientry;
         indexURLEntry uentry;
         String u;
         loop: while (en.hasNext()) {
-            ientry = (indexRWIEntry) en.next();
+            ientry = en.next();
 
             // check constraints
             if (!testFlags(ientry)) continue loop;
@@ -181,12 +180,12 @@ public final class plasmaSearchRankingProcess {
         // normalize entries and get ranking
         timer = System.currentTimeMillis();
         Iterator<indexRWIRowEntry> i = index.entries();
-        indexRWIVarEntry iEntry, l;
+        indexRWIRowEntry iEntry, l;
         long biggestEntry = 0;
         //long s0 = System.currentTimeMillis();
         Long r;
         while (i.hasNext()) {
-            iEntry = new indexRWIVarEntry(i.next());
+            iEntry = i.next();
             if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
 
             // increase flag counts
@@ -216,11 +215,11 @@ public final class plasmaSearchRankingProcess {
                     continue;
                 } else {
                     if (urlhashes.containsKey(iEntry.urlHash())) continue;
-                    l = (indexRWIVarEntry) sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
+                    l = sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
                     urlhashes.remove(l.urlHash());
                     while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
                     sortedRWIEntries.put(r, iEntry);
-                    biggestEntry = order.cardinal((indexRWIVarEntry) sortedRWIEntries.get(sortedRWIEntries.lastKey()));
+                    biggestEntry = order.cardinal(sortedRWIEntries.get(sortedRWIEntries.lastKey()));
                 }
             }
             
@@ -267,18 +266,18 @@ public final class plasmaSearchRankingProcess {
     private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) {
         // returns from the current RWI list the best entry and removed this entry from the list
         Object bestEntry;
-        TreeMap<Object, indexRWIEntry> m;
-        indexRWIEntry rwi;
+        TreeMap<Object, indexRWIRowEntry> m;
+        indexRWIRowEntry rwi;
         while (sortedRWIEntries.size() > 0) {
             bestEntry = sortedRWIEntries.firstKey();
-            rwi = (indexRWIEntry) sortedRWIEntries.remove(bestEntry);
+            rwi = sortedRWIEntries.remove(bestEntry);
             if (!skipDoubleDom) return new Object[]{bestEntry, rwi};
             // check doubledom
             String domhash = rwi.urlHash().substring(6);
-            m = (TreeMap<Object, indexRWIEntry>) this.doubleDomCache.get(domhash);
+            m = this.doubleDomCache.get(domhash);
             if (m == null) {
                 // first appearance of dom
-                m = new TreeMap<Object, indexRWIEntry>();
+                m = new TreeMap<Object, indexRWIRowEntry>();
                 this.doubleDomCache.put(domhash, m);
                 return new Object[]{bestEntry, rwi};
             }
@@ -287,20 +286,20 @@ public final class plasmaSearchRankingProcess {
         }
         // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
         // find best entry from all caches
-        Iterator<TreeMap<Object, indexRWIEntry>> i = this.doubleDomCache.values().iterator();
+        Iterator<TreeMap<Object, indexRWIRowEntry>> i = this.doubleDomCache.values().iterator();
         bestEntry = null;
         Object o;
-        indexRWIEntry bestrwi = null;
+        indexRWIRowEntry bestrwi = null;
         while (i.hasNext()) {
             m = i.next();
             if (m.size() == 0) continue;
             if (bestEntry == null) {
                 bestEntry = m.firstKey();
-                bestrwi = (indexRWIEntry) m.remove(bestEntry);
+                bestrwi = m.remove(bestEntry);
                 continue;
             }
             o = m.firstKey();
-            rwi = (indexRWIEntry) m.remove(o);
+            rwi = m.remove(o);
             if (o instanceof Long) {
                 if (((Long) o).longValue() < ((Long) bestEntry).longValue()) {
                     bestEntry = o;
@@ -326,7 +325,7 @@ public final class plasmaSearchRankingProcess {
         while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
             Object[] obrwi = bestRWI(skipDoubleDom);
             Object bestEntry = obrwi[0];
-            indexRWIEntry ientry = (indexRWIEntry) obrwi[1];
+            indexRWIRowEntry ientry = (indexRWIRowEntry) obrwi[1];
             long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0;
             indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking);
             if (u != null) {
@@ -342,7 +341,7 @@ public final class plasmaSearchRankingProcess {
     public synchronized int size() {
         //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
         int c = sortedRWIEntries.size();
-        Iterator<TreeMap<Object, indexRWIEntry>> i = this.doubleDomCache.values().iterator();
+        Iterator<TreeMap<Object, indexRWIRowEntry>> i = this.doubleDomCache.values().iterator();
         while (i.hasNext()) c += i.next().size();
         return c;
     }
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index caf195ae4..47b314544 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -59,6 +59,7 @@ import java.util.TreeSet;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
+import de.anomic.index.indexURLEntry;
 import de.anomic.kelondro.kelondroMScoreCluster;
 import de.anomic.kelondro.kelondroMSetTools;
 import de.anomic.plasma.cache.IResourceInfo;
@@ -246,9 +247,9 @@ public class plasmaSnippetCache {
     }
     
     @SuppressWarnings("unchecked")
-    public static TextSnippet retrieveTextSnippet(yacyURL url, Set<String> queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
+    public static TextSnippet retrieveTextSnippet(indexURLEntry.Components comp, Set<String> queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
         // heise = "0OQUNU3JSs05"
-        
+        yacyURL url = comp.url();
         if (queryhashes.size() == 0) {
             //System.out.println("found no queryhashes for URL retrieve " + url);
             return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given");
@@ -258,8 +259,8 @@ public class plasmaSnippetCache {
         int source = SOURCE_CACHE;
         String wordhashes = yacySearch.set2string(queryhashes);
         String line = retrieveFromCache(wordhashes, url.hash());
-        if (line != null) {        	
-            //System.out.println("found snippet for URL " + url + " in cache: " + line);
+        if (line != null) {
+            // found the snippet
             return new TextSnippet(url, line, source, null, null, faviconCache.get(url.hash()));
         }
         
@@ -279,7 +280,11 @@ public class plasmaSnippetCache {
                 if ((resContentLength > maxDocLen) && (!fetchOnline)) {
                     // content may be too large to be parsed here. To be fast, we omit calculation of snippet here
                     return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
-                }
+                }/*
+            } else if (url.) {
+                // try to create the snippet from information given in the url itself
+                */
+                
             } else if (fetchOnline) {
                 // if not found try to download it
                 
@@ -342,7 +347,7 @@ public class plasmaSnippetCache {
         if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon);
         Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
         String textline = (tsr == null) ? null : (String) tsr[0];
-        Set<String> remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
+        Set<String> remainingHashes = (tsr == null) ? queryhashes : (Set<String>) tsr[1];
         
         // compute snippet from media
         String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index ce2410611..44c396942 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -598,7 +598,7 @@ public final class plasmaWordIndex implements indexRI {
         public void run() {
             serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
             indexContainer container = null;
-            indexRWIEntry entry = null;
+            indexRWIRowEntry entry = null;
             yacyURL url = null;
             HashSet<String> urlHashs = new HashSet<String>();
             Iterator<indexContainer> indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator();
@@ -609,7 +609,7 @@ public final class plasmaWordIndex implements indexRI {
                 wordHashNow = container.getWordHash();
                 while (containerIterator.hasNext() && run) {
                     waiter();
-                    entry = (indexRWIEntry) containerIterator.next();
+                    entry = containerIterator.next();
                     // System.out.println("Wordhash: "+wordHash+" UrlHash:
                     // "+entry.getUrlHash());
                     indexURLEntry ue = lurl.load(entry.urlHash(), entry, 0);