From ef78f22ee1bd535d10aa431d754b0fe9deefe136 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Wed, 25 Jan 2012 12:48:48 +0100
Subject: [PATCH] performance hack: cap hashSentence at 100 words; reset pooled digests

---
 .../net/yacy/document/SnippetExtractor.java   | 42 +++++++++----------
 source/net/yacy/document/WordTokenizer.java   | 18 ++++----
 source/net/yacy/kelondro/order/Digest.java    | 10 +++--
 .../net/yacy/search/snippet/MediaSnippet.java |  8 ++--
 .../net/yacy/search/snippet/TextSnippet.java  |  3 +-
 5 files changed, 44 insertions(+), 37 deletions(-)
diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java
index 5c23a6eac..417833418 100644
--- a/source/net/yacy/document/SnippetExtractor.java
+++ b/source/net/yacy/document/SnippetExtractor.java
@@ -7,12 +7,12 @@
  *  modify it under the terms of the GNU Lesser General Public
  *  License as published by the Free Software Foundation; either
  *  version 2.1 of the License, or (at your option) any later version.
- *  
+ *
  *  This library is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  Lesser General Public License for more details.
- *  
+ *
  *  You should have received a copy of the GNU Lesser General Public License
  *  along with this program in the file lgpl21.txt
  *  If not, see <http://www.gnu.org/licenses/>.
@@ -32,10 +32,10 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 
 public class SnippetExtractor {
-    
+
     String snippetString;
     HandleSet remainingHashes;
-    
+
     public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
         if (sentences == null) throw new UnsupportedOperationException("sentence == null");
         if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
@@ -47,7 +47,7 @@ public class SnippetExtractor {
         int linenumber = 0;
         int fullmatchcounter = 0;
         lookup: for (final StringBuilder sentence: sentences) {
-            hs = WordTokenizer.hashSentence(sentence.toString(), null);
+            hs = WordTokenizer.hashSentence(sentence.toString(), null, 100);
             positions = new TreeSet<Integer>();
             for (final byte[] word: queryhashes) {
                 pos = hs.get(word);
@@ -69,7 +69,7 @@ public class SnippetExtractor {
             }
             linenumber++;
         }
-        
+
         StringBuilder sentence;
         SnippetExtractor tsr;
         while (!order.isEmpty()) {
@@ -79,27 +79,27 @@ public class SnippetExtractor {
             } catch (UnsupportedOperationException e) {
                 continue;
             }
-            snippetString = tsr.snippetString;
-            if (snippetString != null && snippetString.length() > 0) {
-                remainingHashes = tsr.remainingHashes;
-                if (remainingHashes.isEmpty()) {
+            this.snippetString = tsr.snippetString;
+            if (this.snippetString != null && this.snippetString.length() > 0) {
+                this.remainingHashes = tsr.remainingHashes;
+                if (this.remainingHashes.isEmpty()) {
                     // we have found the snippet
                     return; // finished!
-                } else if (remainingHashes.size() < queryhashes.size()) {
+                } else if (this.remainingHashes.size() < queryhashes.size()) {
                     // the result has not all words in it.
                     // find another sentence that represents the missing other words
                     // and find recursively more sentences
-                    maxLength = maxLength - snippetString.length();
+                    maxLength = maxLength - this.snippetString.length();
                     if (maxLength < 20) maxLength = 20;
                     try {
-                        tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength);
+                        tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
                     } catch (UnsupportedOperationException e) {
                         throw e;
                     }
                     final String nextSnippet = tsr.snippetString;
                     if (nextSnippet == null) return;
-                    snippetString = snippetString + (" / " + nextSnippet);
-                    remainingHashes = tsr.remainingHashes;
+                    this.snippetString = this.snippetString + (" / " + nextSnippet);
+                    this.remainingHashes = tsr.remainingHashes;
                     return;
                 } else {
                     // error
@@ -110,7 +110,7 @@ public class SnippetExtractor {
         }
         throw new UnsupportedOperationException("no snippet computed");
     }
-    
+
     private static int linelengthKey(int givenlength, int maxlength) {
         if (givenlength > maxlength) return 1;
         if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
@@ -118,15 +118,15 @@ public class SnippetExtractor {
         if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
         return 0;
     }
-    
+
     private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
         try {
             if (sentence == null) throw new UnsupportedOperationException("no sentence given");
             if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
             byte[] hash;
-            
+
             // find all hashes that appear in the sentence
-            final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null);
+            final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
             final Iterator<byte[]> j = queryhashes.iterator();
             Integer pos;
             int p, minpos = sentence.length(), maxpos = -1;
@@ -189,11 +189,11 @@ public class SnippetExtractor {
             throw new UnsupportedOperationException(e.getMessage());
         }
     }
-    
+
     public String getSnippet() {
         return this.snippetString;
     }
-    
+
     public HandleSet getRemainingWords() {
         return this.remainingHashes;
     }
diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java
index 664c4e8cc..6dc30bb75 100644
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@@ -68,10 +68,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
         return null;
     }
 
+    @Override
     public boolean hasMoreElements() {
         return this.buffer != null;
     }
 
+    @Override
     public StringBuilder nextElement() {
         final StringBuilder r = (this.buffer == null) ? null : this.buffer;
         this.buffer = nextElement0();
@@ -79,9 +81,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
         if (this.meaningLib != null) WordCache.learn(r);
         return r;
     }
-    
+
     public void close() {
-    	e.close();
+    	this.e.close();
     }
 
     private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
@@ -139,10 +141,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
             return r;
         }
 
+        @Override
         public boolean hasMoreElements() {
             return this.buffer != null;
         }
 
+        @Override
         public StringBuilder nextElement() {
             final StringBuilder r = this.buffer;
             this.buffer = nextElement0();
@@ -150,7 +154,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
         }
 
         public void close() {
-        	e.close();
+        	this.e.close();
         }
     }
 
@@ -177,7 +181,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
      * @param sentence the sentence to be tokenized
      * @return a ordered map containing word hashes as key and positions as value. The map is orderd by the hash ordering
      */
-    public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib) {
+    public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
         final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
         final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
         try {
@@ -185,16 +189,16 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
 	        StringBuilder word;
 	        byte[] hash;
 	        Integer oldpos;
-	        while (words.hasMoreElements()) {
+	        while (words.hasMoreElements() && maxlength-- > 0) {
 	            word = words.nextElement();
 	            hash = Word.word2hash(word);
-	
+
 	            // don't overwrite old values, that leads to too far word distances
 	            oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
 	            if (oldpos != null) {
 	                map.put(hash, oldpos);
 	            }
-	
+
 	            pos += word.length() + 1;
 	        }
 	        return map;
diff --git a/source/net/yacy/kelondro/order/Digest.java b/source/net/yacy/kelondro/order/Digest.java
index 9da0afcf2..d1efe21d7 100644
--- a/source/net/yacy/kelondro/order/Digest.java
+++ b/source/net/yacy/kelondro/order/Digest.java
@@ -65,7 +65,7 @@ public class Digest {
             md5Cache = new ConcurrentARC<String, byte[]>(1000, Math.max(8, 2 * Runtime.getRuntime().availableProcessors()));
         }
     }
-    
+
     public static String encodeHex(final long in, final int length) {
         String s = Long.toHexString(in);
         while (s.length() < length) s = "0" + s;
@@ -119,7 +119,7 @@ public class Digest {
 
         byte[] h = md5Cache.get(key);
         if (h != null) return h;
-        
+
     	MessageDigest digest = digestPool.poll();
     	if (digest == null) {
     	    // if there are no digest objects left, create some on the fly
@@ -129,12 +129,14 @@ public class Digest {
                 digest.reset();
             } catch (final NoSuchAlgorithmException e) {
             }
+    	} else {
+    	    digest.reset(); // pooled digests should already be reset; resetting again is a harmless safeguard
     	}
         byte[] keyBytes;
         keyBytes = UTF8.getBytes(key);
         digest.update(keyBytes);
         final byte[] result = digest.digest();
-        digest.reset();
+        digest.reset(); // leave the digest clean before it is returned to the pool
         try {
             digestPool.put(digest);
             //System.out.println("Digest Pool size = " + digestPool.size());
@@ -390,7 +392,7 @@ public class Digest {
         }
 
         System.out.println("time: " + (System.currentTimeMillis() - start) + " ms");
-        
+
         // without this this method would never end
         Log.shutdown();
     }
diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java
index e7bff135f..6b4691655 100644
--- a/source/net/yacy/search/snippet/MediaSnippet.java
+++ b/source/net/yacy/search/snippet/MediaSnippet.java
@@ -27,7 +27,7 @@ package net.yacy.search.snippet;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
-import java.util.Date;
+import java.util.Date;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -51,8 +51,8 @@ import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.util.ByteArray;
 import net.yacy.repository.Blacklist;
 import net.yacy.search.Switchboard;
-import de.anomic.crawler.retrieval.Request;
 import de.anomic.crawler.ZURL.FailCategory;
+import de.anomic.crawler.retrieval.Request;
 
 
 public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@@ -117,10 +117,12 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
         return Base64Order.enhancedCoder.equal(this.href.hash(), other.href.hash());
     }
 
+    @Override
     public int compareTo(final MediaSnippet o) {
         return Base64Order.enhancedCoder.compare(this.href.hash(), o.href.hash());
     }
 
+    @Override
     public int compare(final MediaSnippet o1, final MediaSnippet o2) {
         return o1.compareTo(o2);
     }
@@ -217,7 +219,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
     private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
         // remove all hashes that appear in the sentence
         if (sentence == null) return queryhashes;
-        final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null);
+        final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
         final Iterator<byte[]> j = queryhashes.iterator();
         byte[] hash;
         Integer pos;
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 3dd2d0c67..20c6b44b8 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -497,8 +497,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
 
     private static boolean containsAllHashes(
             final String sentence, final HandleSet queryhashes) {
-        final SortedMap<byte[], Integer> m =
-                WordTokenizer.hashSentence(sentence, null);
+        final SortedMap<byte[], Integer> m = WordTokenizer.hashSentence(sentence, null, 100);
         for (final byte[] b : queryhashes) {
             if (!(m.containsKey(b))) {
                 return false;