diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java
index 5c23a6eac..417833418 100644
--- a/source/net/yacy/document/SnippetExtractor.java
+++ b/source/net/yacy/document/SnippetExtractor.java
@@ -7,12 +7,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
- *
+ *
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
+ *
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@@ -32,10 +32,10 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
public class SnippetExtractor {
-
+
String snippetString;
HandleSet remainingHashes;
-
+
public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
if (sentences == null) throw new UnsupportedOperationException("sentence == null");
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
@@ -47,7 +47,7 @@ public class SnippetExtractor {
int linenumber = 0;
int fullmatchcounter = 0;
lookup: for (final StringBuilder sentence: sentences) {
- hs = WordTokenizer.hashSentence(sentence.toString(), null);
+ hs = WordTokenizer.hashSentence(sentence.toString(), null, 100);
positions = new TreeSet<Integer>();
for (final byte[] word: queryhashes) {
pos = hs.get(word);
@@ -69,7 +69,7 @@ public class SnippetExtractor {
}
linenumber++;
}
-
+
StringBuilder sentence;
SnippetExtractor tsr;
while (!order.isEmpty()) {
@@ -79,27 +79,27 @@ public class SnippetExtractor {
} catch (UnsupportedOperationException e) {
continue;
}
- snippetString = tsr.snippetString;
- if (snippetString != null && snippetString.length() > 0) {
- remainingHashes = tsr.remainingHashes;
- if (remainingHashes.isEmpty()) {
+ this.snippetString = tsr.snippetString;
+ if (this.snippetString != null && this.snippetString.length() > 0) {
+ this.remainingHashes = tsr.remainingHashes;
+ if (this.remainingHashes.isEmpty()) {
// we have found the snippet
return; // finished!
- } else if (remainingHashes.size() < queryhashes.size()) {
+ } else if (this.remainingHashes.size() < queryhashes.size()) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
- maxLength = maxLength - snippetString.length();
+ maxLength = maxLength - this.snippetString.length();
if (maxLength < 20) maxLength = 20;
try {
- tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength);
+ tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
} catch (UnsupportedOperationException e) {
throw e;
}
final String nextSnippet = tsr.snippetString;
if (nextSnippet == null) return;
- snippetString = snippetString + (" / " + nextSnippet);
- remainingHashes = tsr.remainingHashes;
+ this.snippetString = this.snippetString + (" / " + nextSnippet);
+ this.remainingHashes = tsr.remainingHashes;
return;
} else {
// error
@@ -110,7 +110,7 @@ public class SnippetExtractor {
}
throw new UnsupportedOperationException("no snippet computed");
}
-
+
private static int linelengthKey(int givenlength, int maxlength) {
if (givenlength > maxlength) return 1;
if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
@@ -118,15 +118,15 @@ public class SnippetExtractor {
if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
return 0;
}
-
+
private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
try {
if (sentence == null) throw new UnsupportedOperationException("no sentence given");
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
byte[] hash;
-
+
// find all hashes that appear in the sentence
- final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null);
+ final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
final Iterator<byte[]> j = queryhashes.iterator();
Integer pos;
int p, minpos = sentence.length(), maxpos = -1;
@@ -189,11 +189,11 @@ public class SnippetExtractor {
throw new UnsupportedOperationException(e.getMessage());
}
}
-
+
public String getSnippet() {
return this.snippetString;
}
-
+
public HandleSet getRemainingWords() {
return this.remainingHashes;
}
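The loop above assembles a snippet from several sentences when no single sentence covers all query words: hashes that were found are dropped from the remaining set, the length budget shrinks (but never below 20), and partial snippets are joined with " / ". An illustrative sketch of that control flow, using plain JDK collections instead of YaCy's HandleSet (not part of the patch):

    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class SnippetAssemblySketch {
        // returns a combined snippet covering as many query words as possible, or null
        public static String assemble(final List<String> sentences, final Set<String> queryWords, int maxLength) {
            for (final String sentence : sentences) {
                final Set<String> remaining = new HashSet<String>(queryWords);
                for (final String w : sentence.split(" ")) remaining.remove(w);
                if (remaining.size() == queryWords.size()) continue; // sentence contains no query word
                if (remaining.isEmpty()) return sentence;            // all query words found: finished
                // partial match: shrink the budget and cover the missing words recursively
                maxLength = Math.max(20, maxLength - sentence.length());
                final String rest = assemble(sentences, remaining, maxLength);
                return rest == null ? sentence : sentence + " / " + rest;
            }
            return null; // no snippet could be computed
        }
    }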
diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java
index 664c4e8cc..6dc30bb75 100644
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@@ -68,10 +68,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return null;
}
+ @Override
public boolean hasMoreElements() {
return this.buffer != null;
}
+ @Override
public StringBuilder nextElement() {
final StringBuilder r = (this.buffer == null) ? null : this.buffer;
this.buffer = nextElement0();
@@ -79,9 +81,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
if (this.meaningLib != null) WordCache.learn(r);
return r;
}
-
+
public void close() {
- e.close();
+ this.e.close();
}
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
@@ -139,10 +141,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return r;
}
+ @Override
public boolean hasMoreElements() {
return this.buffer != null;
}
+ @Override
public StringBuilder nextElement() {
final StringBuilder r = this.buffer;
this.buffer = nextElement0();
@@ -150,7 +154,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
}
public void close() {
- e.close();
+ this.e.close();
}
}
@@ -177,7 +181,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
* @param sentence the sentence to be tokenized
* @return an ordered map containing word hashes as key and positions as value. The map is ordered by the hash ordering
*/
- public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib) {
+ public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
try {
@@ -185,16 +189,16 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
StringBuilder word;
byte[] hash;
Integer oldpos;
- while (words.hasMoreElements()) {
+ while (words.hasMoreElements() && maxlength-- > 0) {
word = words.nextElement();
hash = Word.word2hash(word);
-
+
// don't overwrite old values; that would lead to overly large word distances
oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
if (oldpos != null) {
map.put(hash, oldpos);
}
-
+
pos += word.length() + 1;
}
return map;
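The new third argument caps how many words per sentence are hashed; the loop stops once maxlength words have been consumed, and duplicate words keep the position of their first occurrence. A minimal usage sketch (not part of the patch; assumes it is compiled inside the YaCy source tree, and the sample sentence is made up):

    import java.util.Map;
    import java.util.SortedMap;

    import net.yacy.document.WordTokenizer;

    public class HashSentenceDemo {
        public static void main(final String[] args) {
            final String sentence = "the quick brown fox jumps over the lazy dog";
            // null meaningLib: no dictionary lookups; 100: hash at most the first 100 words
            final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
            for (final Map.Entry<byte[], Integer> entry : hs.entrySet()) {
                // keys are word hashes, values are the character position of the
                // word's first occurrence inside the sentence
                System.out.println(new String(entry.getKey()) + " -> " + entry.getValue());
            }
        }
    }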
diff --git a/source/net/yacy/kelondro/order/Digest.java b/source/net/yacy/kelondro/order/Digest.java
index 9da0afcf2..d1efe21d7 100644
--- a/source/net/yacy/kelondro/order/Digest.java
+++ b/source/net/yacy/kelondro/order/Digest.java
@@ -65,7 +65,7 @@ public class Digest {
md5Cache = new ConcurrentARC<String, byte[]>(1000, Math.max(8, 2 * Runtime.getRuntime().availableProcessors()));
}
}
-
+
public static String encodeHex(final long in, final int length) {
String s = Long.toHexString(in);
while (s.length() < length) s = "0" + s;
@@ -119,7 +119,7 @@ public class Digest {
byte[] h = md5Cache.get(key);
if (h != null) return h;
-
+
MessageDigest digest = digestPool.poll();
if (digest == null) {
// if there are no digest objects left, create some on the fly
@@ -129,12 +129,14 @@ public class Digest {
digest.reset();
} catch (final NoSuchAlgorithmException e) {
}
+ } else {
+ digest.reset(); // they should all be reset already, but resetting here is safe anyway
}
byte[] keyBytes;
keyBytes = UTF8.getBytes(key);
digest.update(keyBytes);
final byte[] result = digest.digest();
- digest.reset();
+ digest.reset(); // leave the instance ready for the next use
try {
digestPool.put(digest);
//System.out.println("Digest Pool size = " + digestPool.size());
@@ -390,7 +392,7 @@ public class Digest {
}
System.out.println("time: " + (System.currentTimeMillis() - start) + " ms");
-
+
// without this, this method would never end
Log.shutdown();
}
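The hardened pattern here is a pool of reusable MessageDigest instances that are reset both when taken out (defensively) and before being handed back. A standalone sketch of that pattern with plain JDK types (class and field names are illustrative, not the Digest API):

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;
    import java.util.concurrent.LinkedBlockingQueue;

    public class DigestPoolSketch {
        private static final LinkedBlockingQueue<MessageDigest> pool = new LinkedBlockingQueue<MessageDigest>();

        public static byte[] md5(final String key) {
            MessageDigest digest = pool.poll();
            if (digest == null) {
                try {
                    digest = MessageDigest.getInstance("MD5"); // pool empty: create one on the fly
                } catch (final NoSuchAlgorithmException e) {
                    throw new IllegalStateException(e);
                }
            } else {
                digest.reset(); // should already be clean, but resetting is cheap and safe
            }
            digest.update(key.getBytes(StandardCharsets.UTF_8));
            final byte[] result = digest.digest();
            digest.reset(); // leave the instance ready for the next caller
            pool.offer(digest); // return it to the pool (unbounded queue, never blocks)
            return result;
        }
    }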
diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java
index e7bff135f..6b4691655 100644
--- a/source/net/yacy/search/snippet/MediaSnippet.java
+++ b/source/net/yacy/search/snippet/MediaSnippet.java
@@ -27,7 +27,7 @@ package net.yacy.search.snippet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
-import java.util.Date;
+import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -51,8 +51,8 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.Blacklist;
import net.yacy.search.Switchboard;
-import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.ZURL.FailCategory;
+import de.anomic.crawler.retrieval.Request;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@@ -117,10 +117,12 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
- final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null);
+ final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
final Iterator<byte[]> j = queryhashes.iterator();
byte[] hash;
Integer pos;
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 3dd2d0c67..20c6b44b8 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -497,8 +497,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
- final SortedMap<byte[], Integer> m =
- WordTokenizer.hashSentence(sentence, null);
+ final SortedMap<byte[], Integer> m = WordTokenizer.hashSentence(sentence, null, 100);
for (final byte[] b : queryhashes) {
if (!(m.containsKey(b))) {
return false;
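TextSnippet uses the capped hashSentence() purely as a containment test: a sentence qualifies only if every query hash occurs among its first 100 words. A rough sketch of that check (assuming Word.word2hash also accepts a String, as suggested by its use on StringBuilder tokens above; the sample words are made up):

    import java.util.SortedMap;

    import net.yacy.document.WordTokenizer;
    import net.yacy.kelondro.data.word.Word;

    public class ContainsAllWordsSketch {
        // true if every query hash occurs among the hashes of the sentence's first 100 words
        public static boolean containsAllHashes(final String sentence, final byte[][] queryhashes) {
            final SortedMap<byte[], Integer> m = WordTokenizer.hashSentence(sentence, null, 100);
            for (final byte[] b : queryhashes) {
                if (!m.containsKey(b)) return false; // this query word is missing
            }
            return true;
        }

        public static void main(final String[] args) {
            final byte[][] query = { Word.word2hash("quick"), Word.word2hash("fox") };
            System.out.println(containsAllHashes("the quick brown fox", query));
        }
    }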