From b9a2a2d28799e8891ccef72741a780da11de4703 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Thu, 24 Apr 2008 15:09:06 +0000
Subject: [PATCH] more search performance hacks

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4735 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../htmlFilter/htmlFilterAbstractScraper.java  | 18 +++++++++++-------
 .../htmlFilter/htmlFilterContentScraper.java   | 18 +++++-------------
 .../anomic/kelondro/kelondroRowCollection.java | 15 ++++++++++-----
 source/de/anomic/plasma/plasmaCondenser.java   |  8 +++-----
 .../plasma/plasmaSearchRankingProcess.java     | 10 +++++-----
 .../de/anomic/plasma/plasmaSnippetCache.java   | 14 ++++++++------
 source/de/anomic/yacy/yacyURL.java             |  8 +++++++-
 7 files changed, 49 insertions(+), 42 deletions(-)
diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
index 642c290e6..8aa8842bd 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -27,8 +27,8 @@
 package de.anomic.htmlFilter;
 
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Properties;
-import java.util.TreeSet;
 
 import de.anomic.server.serverCharBuffer;
 
@@ -38,8 +38,8 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
     public static final char rb = '>';
     public static final char sl = '/';
  
-    private TreeSet<String> tags0;
-    private TreeSet<String> tags1;
+    private HashSet<String> tags0;
+    private HashSet<String> tags1;
 
     // define a translation table for html character codings
     private static HashMap<String, String> trans = new HashMap<String, String>(300);
@@ -289,18 +289,22 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
         trans.put("&rsaquo;", ""); //angewinkeltes einzelnes Anf.zeichen rechts
     }
 
-
-    public htmlFilterAbstractScraper(TreeSet<String> tags0, TreeSet<String> tags1) {
+    /**
+     * Creates a scraper. Both tag sets must hold lowercase tag names: isTag0/isTag1 lowercase the tag before lookup.
+     * @param tags0
+     * @param tags1
+     */
+    public htmlFilterAbstractScraper(HashSet<String> tags0, HashSet<String> tags1) {
         this.tags0  = tags0;
         this.tags1  = tags1;
     }
 
     public boolean isTag0(String tag) {
-        return (tags0 != null) && (tags0.contains(tag));
+        return (tags0 != null) && (tags0.contains(tag.toLowerCase()));
     }
 
     public boolean isTag1(String tag) {
-        return (tags1 != null) && (tags1.contains(tag));
+        return (tags1 != null) && (tags1.contains(tag.toLowerCase()));
     }
 
     //the 'missing' method that shall be implemented:
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index e07ebb812..3e48b5f3e 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -51,15 +51,13 @@ import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.net.MalformedURLException;
-import java.text.Collator;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
-import java.util.TreeSet;
 
 import javax.swing.event.EventListenerList;
 
@@ -71,17 +69,11 @@ import de.anomic.yacy.yacyURL;
 public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
 
     // statics: for initialization of the HTMLFilterAbstractScraper
-    private static TreeSet<String> linkTags0;
-    private static TreeSet<String> linkTags1;
+    private static HashSet<String> linkTags0;
+    private static HashSet<String> linkTags1;
 
-    private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
     static {
-        insensitiveCollator.setStrength(Collator.SECONDARY);
-        insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
-    }
-
-    static {
-        linkTags0 = new TreeSet<String>(insensitiveCollator);
+        linkTags0 = new HashSet<String>();
         linkTags0.add("img");
         linkTags0.add("base");
         linkTags0.add("frame");
@@ -91,7 +83,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         linkTags0.add("embed");     //added by [MN]
         linkTags0.add("param");     //added by [MN]
 
-        linkTags1 = new TreeSet<String>(insensitiveCollator);
+        linkTags1 = new HashSet<String>();
         linkTags1.add("a");
         linkTags1.add("h1");
         linkTags1.add("h2");
diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java
index 71246df93..1f7587ed6 100644
--- a/source/de/anomic/kelondro/kelondroRowCollection.java
+++ b/source/de/anomic/kelondro/kelondroRowCollection.java
@@ -234,7 +234,7 @@ public class kelondroRowCollection {
                     // grow instead of shrink, simply ignore the growfactor
         if (serverMemory.available() + 1000 < needed)
             return; // if the swap buffer is not available, we must give up.
-                    // This is not critical. Othervise we provoke a serious
+                    // This is not critical. Otherwise we provoke a serious
                     // problem with OOM
         byte[] newChunkcache = new byte[needed];
         System.arraycopy(chunkcache, 0, newChunkcache, 0, Math.min(
@@ -264,15 +264,20 @@ public class kelondroRowCollection {
         return b;
     }
     
-    public synchronized final kelondroRow.Entry get(int index, boolean clone) {
+    public final kelondroRow.Entry get(int index, boolean clone) {
         assert (index >= 0) : "get: access with index " + index + " is below zero";
         assert (index < chunkcount) : "get: access with index " + index + " is above chunkcount " + chunkcount + "; sortBound = " + sortBound;
         assert (index * rowdef.objectsize < chunkcache.length);
         if ((chunkcache == null) || (rowdef == null)) return null; // case may appear during shutdown
-        if (index >= chunkcount) return null;
-        if ((index + 1) * rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache
+        kelondroRow.Entry entry;
+        int addr = index * rowdef.objectsize;
+        synchronized (this) {
+            if (index >= chunkcount) return null;
+            if (addr + rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache
+            entry = rowdef.newEntry(chunkcache, addr, clone);
+        }
         this.lastTimeRead = System.currentTimeMillis();
-        return rowdef.newEntry(chunkcache, index * rowdef.objectsize, clone);
+        return entry;
     }
     
     public synchronized final void set(int index, kelondroRow.Entry a) {
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 4f87c2fb2..9c9a86b79 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -611,11 +611,9 @@ public final class plasmaCondenser {
 
     }
     
-    public static StringBuffer trim(StringBuffer sb) {
-        synchronized (sb) {
-            while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
-            while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
-        }
+    private static StringBuffer trim(StringBuffer sb) {
+        while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
+        while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
         return sb;
     }
     
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
index 0bb9f8e19..ce9230d23 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
@@ -246,12 +246,13 @@ public final class plasmaSearchRankingProcess {
     // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
     
     
-    private synchronized kelondroSortStack<indexRWIVarEntry>.stackElement bestRWI(boolean skipDoubleDom) {
-        // returns from the current RWI list the best entry and removed this entry from the list
+    private kelondroSortStack<indexRWIVarEntry>.stackElement bestRWI(boolean skipDoubleDom) {
+        // returns from the current RWI list the best entry and removes this entry from the list
         kelondroSortStack<indexRWIVarEntry> m;
         kelondroSortStack<indexRWIVarEntry>.stackElement rwi;
         while (stack.size() > 0) {
             rwi = stack.pop();
+            if (rwi == null) continue; // a synchronization problem may yield a null element; skip it and try the next one
             if (!skipDoubleDom) return rwi;
             // check doubledom
             String domhash = rwi.element.urlHash().substring(6);
@@ -272,6 +273,7 @@ public final class plasmaSearchRankingProcess {
         kelondroSortStack<indexRWIVarEntry>.stackElement o;
         while (i.hasNext()) {
             m = i.next();
+            if (m == null) continue;
             if (m.size() == 0) continue;
             if (bestEntry == null) {
                 bestEntry = m.top();
@@ -293,7 +295,6 @@ public final class plasmaSearchRankingProcess {
     public indexURLReference bestURL(boolean skipDoubleDom) {
         // returns from the current RWI list the best URL entry and removed this entry from the list
         while ((stack.size() > 0) || (size() > 0)) {
-            synchronized (this) {
                 if (((stack.size() == 0) && (size() == 0))) break;
                 kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom);
                 indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
@@ -303,12 +304,11 @@ public final class plasmaSearchRankingProcess {
                     return u;
                 }
                 misses.add(obrwi.element.urlHash());
-            }
         }
         return null;
     }
     
-    public synchronized int size() {
+    public int size() {
         //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
         int c = stack.size();
         Iterator<kelondroSortStack<indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 7db01e9ce..57e28584b 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -366,16 +366,16 @@ public class plasmaSnippetCache {
         Set<String> remainingHashes = (tsr == null) ? queryhashes : (Set<String>) tsr[1];
         
         // compute snippet from media
-        String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
-        String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
-        String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
+        //String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+        //String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
+        //String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
         //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
         //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
         
         line = "";
-        if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
-        if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
-        if (appline   != null) line += (line.length() == 0) ? appline   : "<br />" + appline;
+        //if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
+        //if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
+        //if (appline   != null) line += (line.length() == 0) ? appline   : "<br />" + appline;
         //if (hrefline  != null) line += (line.length() == 0) ? hrefline  : "<br />" + hrefline;
         if (textline  != null) line += (line.length() == 0) ? textline  : "<br />" + textline;
         
@@ -494,6 +494,7 @@ public class plasmaSnippetCache {
         return snippetsCache.get(key);
     }
     
+    /*
     private static String computeMediaSnippet(Map<yacyURL, String> media, Set<String> queryhashes) {
         Iterator<Map.Entry<yacyURL, String>> i = media.entrySet().iterator();
         Map.Entry<yacyURL, String> entry;
@@ -519,6 +520,7 @@ public class plasmaSnippetCache {
         if (result.length() == 0) return null;
         return result.substring(6);
     }
+    */
     
     @SuppressWarnings("unchecked")
     private static Object[] /*{String - the snippet, Set - remaining hashes}*/
diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java
index 47ba8aac3..0d4f2c62a 100644
--- a/source/de/anomic/yacy/yacyURL.java
+++ b/source/de/anomic/yacy/yacyURL.java
@@ -904,8 +904,14 @@ public class yacyURL {
                 ((this.port     == other.port    )));
     }
     
+    /**
+     * Hash code computation for yacyURL; do not confuse this with the YaCy hash.
+     * This hash is only used by hashing data structures such as a HashMap.
+     * We do not use the YaCy hash here because computing it requires a DNS lookup,
+     * which is very time-intensive.
+     */
     public int hashCode() {
-        return this.hash().hashCode();
+        return this.toNormalform(true, false).hashCode();
     }
     
     public int compareTo(Object h) {