From ef78f22ee1bd535d10aa431d754b0fe9deefe136 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 25 Jan 2012 12:48:48 +0100 Subject: [PATCH] performance hack --- .../net/yacy/document/SnippetExtractor.java | 42 +++++++++---------- source/net/yacy/document/WordTokenizer.java | 18 ++++---- source/net/yacy/kelondro/order/Digest.java | 10 +++-- .../net/yacy/search/snippet/MediaSnippet.java | 8 ++-- .../net/yacy/search/snippet/TextSnippet.java | 3 +- 5 files changed, 44 insertions(+), 37 deletions(-) diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java index 5c23a6eac..417833418 100644 --- a/source/net/yacy/document/SnippetExtractor.java +++ b/source/net/yacy/document/SnippetExtractor.java @@ -7,12 +7,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -32,10 +32,10 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; public class SnippetExtractor { - + String snippetString; HandleSet remainingHashes; - + public SnippetExtractor(final Collection sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException { if (sentences == null) throw new UnsupportedOperationException("sentence == null"); if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null"); @@ -47,7 +47,7 @@ public class SnippetExtractor { int linenumber = 0; int fullmatchcounter = 0; lookup: for (final StringBuilder sentence: sentences) { - hs = WordTokenizer.hashSentence(sentence.toString(), null); + hs = WordTokenizer.hashSentence(sentence.toString(), null, 100); positions = new TreeSet(); for (final byte[] word: queryhashes) { pos = hs.get(word); @@ -69,7 +69,7 @@ public class SnippetExtractor { } linenumber++; } - + StringBuilder sentence; SnippetExtractor tsr; while (!order.isEmpty()) { @@ -79,27 +79,27 @@ public class SnippetExtractor { } catch (UnsupportedOperationException e) { continue; } - snippetString = tsr.snippetString; - if (snippetString != null && snippetString.length() > 0) { - remainingHashes = tsr.remainingHashes; - if (remainingHashes.isEmpty()) { + this.snippetString = tsr.snippetString; + if (this.snippetString != null && this.snippetString.length() > 0) { + this.remainingHashes = tsr.remainingHashes; + if (this.remainingHashes.isEmpty()) { // we have found the snippet return; // finished! - } else if (remainingHashes.size() < queryhashes.size()) { + } else if (this.remainingHashes.size() < queryhashes.size()) { // the result has not all words in it. // find another sentence that represents the missing other words // and find recursively more sentences - maxLength = maxLength - snippetString.length(); + maxLength = maxLength - this.snippetString.length(); if (maxLength < 20) maxLength = 20; try { - tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength); + tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength); } catch (UnsupportedOperationException e) { throw e; } final String nextSnippet = tsr.snippetString; if (nextSnippet == null) return; - snippetString = snippetString + (" / " + nextSnippet); - remainingHashes = tsr.remainingHashes; + this.snippetString = this.snippetString + (" / " + nextSnippet); + this.remainingHashes = tsr.remainingHashes; return; } else { // error @@ -110,7 +110,7 @@ public class SnippetExtractor { } throw new UnsupportedOperationException("no snippet computed"); } - + private static int linelengthKey(int givenlength, int maxlength) { if (givenlength > maxlength) return 1; if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7; @@ -118,15 +118,15 @@ public class SnippetExtractor { if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3; return 0; } - + private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException { try { if (sentence == null) throw new UnsupportedOperationException("no sentence given"); if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null"); byte[] hash; - + // find all hashes that appear in the sentence - final Map hs = WordTokenizer.hashSentence(sentence, null); + final Map hs = WordTokenizer.hashSentence(sentence, null, 100); final Iterator j = queryhashes.iterator(); Integer pos; int p, minpos = sentence.length(), maxpos = -1; @@ -189,11 +189,11 @@ public class SnippetExtractor { throw new UnsupportedOperationException(e.getMessage()); } } - + public String getSnippet() { return this.snippetString; } - + public HandleSet getRemainingWords() { return this.remainingHashes; } diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index 664c4e8cc..6dc30bb75 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -68,10 +68,12 @@ public class WordTokenizer implements Enumeration { return null; } + @Override public boolean hasMoreElements() { return this.buffer != null; } + @Override public StringBuilder nextElement() { final StringBuilder r = (this.buffer == null) ? null : this.buffer; this.buffer = nextElement0(); @@ -79,9 +81,9 @@ public class WordTokenizer implements Enumeration { if (this.meaningLib != null) WordCache.learn(r); return r; } - + public void close() { - e.close(); + this.e.close(); } private static class unsievedWordsEnum implements Enumeration { @@ -139,10 +141,12 @@ public class WordTokenizer implements Enumeration { return r; } + @Override public boolean hasMoreElements() { return this.buffer != null; } + @Override public StringBuilder nextElement() { final StringBuilder r = this.buffer; this.buffer = nextElement0(); @@ -150,7 +154,7 @@ public class WordTokenizer implements Enumeration { } public void close() { - e.close(); + this.e.close(); } } @@ -177,7 +181,7 @@ public class WordTokenizer implements Enumeration { * @param sentence the sentence to be tokenized * @return a ordered map containing word hashes as key and positions as value. The map is orderd by the hash ordering */ - public static SortedMap hashSentence(final String sentence, final WordCache meaningLib) { + public static SortedMap hashSentence(final String sentence, final WordCache meaningLib, int maxlength) { final SortedMap map = new TreeMap(Base64Order.enhancedCoder); final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib); try { @@ -185,16 +189,16 @@ public class WordTokenizer implements Enumeration { StringBuilder word; byte[] hash; Integer oldpos; - while (words.hasMoreElements()) { + while (words.hasMoreElements() && maxlength-- > 0) { word = words.nextElement(); hash = Word.word2hash(word); - + // don't overwrite old values, that leads to too far word distances oldpos = map.put(hash, LargeNumberCache.valueOf(pos)); if (oldpos != null) { map.put(hash, oldpos); } - + pos += word.length() + 1; } return map; diff --git a/source/net/yacy/kelondro/order/Digest.java b/source/net/yacy/kelondro/order/Digest.java index 9da0afcf2..d1efe21d7 100644 --- a/source/net/yacy/kelondro/order/Digest.java +++ b/source/net/yacy/kelondro/order/Digest.java @@ -65,7 +65,7 @@ public class Digest { md5Cache = new ConcurrentARC(1000, Math.max(8, 2 * Runtime.getRuntime().availableProcessors())); } } - + public static String encodeHex(final long in, final int length) { String s = Long.toHexString(in); while (s.length() < length) s = "0" + s; @@ -119,7 +119,7 @@ public class Digest { byte[] h = md5Cache.get(key); if (h != null) return h; - + MessageDigest digest = digestPool.poll(); if (digest == null) { // if there are no digest objects left, create some on the fly @@ -129,12 +129,14 @@ public class Digest { digest.reset(); } catch (final NoSuchAlgorithmException e) { } + } else { + digest.reset(); // they should all be reseted but anyway; this is safe } byte[] keyBytes; keyBytes = UTF8.getBytes(key); digest.update(keyBytes); final byte[] result = digest.digest(); - digest.reset(); + digest.reset(); // to be prepared for next try { digestPool.put(digest); //System.out.println("Digest Pool size = " + digestPool.size()); @@ -390,7 +392,7 @@ public class Digest { } System.out.println("time: " + (System.currentTimeMillis() - start) + " ms"); - + // without this this method would never end Log.shutdown(); } diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index e7bff135f..6b4691655 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -27,7 +27,7 @@ package net.yacy.search.snippet; import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; -import java.util.Date; +import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -51,8 +51,8 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.ByteArray; import net.yacy.repository.Blacklist; import net.yacy.search.Switchboard; -import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.ZURL.FailCategory; +import de.anomic.crawler.retrieval.Request; public class MediaSnippet implements Comparable, Comparator { @@ -117,10 +117,12 @@ public class MediaSnippet implements Comparable, Comparator, Comparator hs = WordTokenizer.hashSentence(sentence, null); + final SortedMap hs = WordTokenizer.hashSentence(sentence, null, 100); final Iterator j = queryhashes.iterator(); byte[] hash; Integer pos; diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 3dd2d0c67..20c6b44b8 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -497,8 +497,7 @@ public class TextSnippet implements Comparable, Comparator m = - WordTokenizer.hashSentence(sentence, null); + final SortedMap m = WordTokenizer.hashSentence(sentence, null, 100); for (final byte[] b : queryhashes) { if (!(m.containsKey(b))) { return false;