diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html
index 858186e6b..46fd69a9a 100644
--- a/htroot/ViewFile.html
+++ b/htroot/ViewFile.html
@@ -31,6 +31,7 @@
+
@@ -93,6 +94,12 @@
#[attr]# |
#{/links}#
+::
+
#(/viewMode)#
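For orientation: the new `::` alternative of the #(viewMode)# block renders the word list that ViewFile.java below fills in (viewMode_words_<n>_nr, _word and _dark). A minimal sketch of such a block, following the template conventions visible above (#[..]#, #{..}#, #(..)#); the actual markup in the patch may differ:

    #{words}#
    <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
      <td>#[nr]#</td>
      <td>#[word]#</td>
    </tr>
    #{/words}#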
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index dbdc6bf3f..21cfef6c2 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -29,6 +29,7 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
+import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -59,6 +60,7 @@ public class ViewFile {
public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3;
public static final int VIEW_MODE_AS_IFRAME = 4;
public static final int VIEW_MODE_AS_LINKLIST = 5;
+ public static final int VIEW_MODE_AS_PARSED_WORDS = 6;
private static final String HIGHLIGHT_CSS = "searchHighlight";
private static final int MAX_HIGHLIGHTS = 6;
@@ -240,7 +242,7 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url", url.toNormalform(false, true));
- } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("links")) {
+ } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
@@ -297,6 +299,33 @@ public class ViewFile {
}
prop.put("viewMode_sentences", i);
+ } else if (viewMode.equals("words")) {
+ prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS);
+ final Iterator sentences = document.getSentences(pre);
+
+ boolean dark = true;
+ int i = 0;
+ String sentence, token;
+ if (sentences != null) {
+
+ // Search word highlighting
+ while (sentences.hasNext()) {
+ sentence = sentences.next().toString();
+ Enumeration tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
+ while (tokens.hasMoreElements()) {
+ token = tokens.nextElement().toString();
+ if (token.length() > 0) {
+ prop.put("viewMode_words_" + i + "_nr", i + 1);
+ prop.put("viewMode_words_" + i + "_word", token);
+ prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
+ dark = !dark;
+ i++;
+ }
+ }
+ }
+ }
+ prop.put("viewMode_words", i);
+
} else if (viewMode.equals("links")) {
prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
boolean dark = true;
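The new "words" view reuses the sentence iterator of the parsed document and runs each sentence through plasmaCondenser.wordTokenizer, emitting one numbered, alternately shaded row per non-empty token. A standalone sketch of that loop (hypothetical input string; assumes plasmaCondenser is on the classpath, with the wordTokenizer signature shown further down in this patch):

    import java.util.Enumeration;
    import de.anomic.plasma.plasmaCondenser;

    public class WordListDemo {
        public static void main(String[] args) {
            // tokenize one sentence the same way the "words" view mode does
            final Enumeration tokens = plasmaCondenser.wordTokenizer("Index of /public/downloads", "UTF-8");
            int nr = 0;
            if (tokens != null) { // wordTokenizer returns null when the charset cannot be used
                while (tokens.hasMoreElements()) {
                    final String token = tokens.nextElement().toString();
                    if (token.length() > 0) {
                        System.out.println(++nr + "\t" + token);
                    }
                }
            }
        }
    }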
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index b0a92d097..d2238a65a 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -108,13 +108,15 @@ public final class plasmaCondenser {
this.wordcut = 2;
this.words = new TreeMap();
this.RESULT_FLAGS = new kelondroBitfield(4);
+
+ // construct flag set for document
+ if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
+ if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
+ if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
+ if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
this.languageIdentificator = new Identificator();
- //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
-
- // add the URL components to the word list
- insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
Map.Entry entry;
if (indexText) {
@@ -161,6 +163,9 @@ public final class plasmaCondenser {
this.RESULT_DIFF_SENTENCES = 0;
}
+ // add the URL components to the word list
+ insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
+
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
@@ -209,12 +214,6 @@ public final class plasmaCondenser {
}
}
}
-
- // construct flag set for document
- if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
- if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
- if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
- if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
}
private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) {
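The two moves in this constructor belong together: the category flags (hasimage, hasaudio, hasvideo, hasapp) are now set at the very beginning, and the URL components are added to the word list only after the text has been condensed. The apparent intent is that RESULT_FLAGS acts as the flag template handed to insertTextToWords, so the category bits should be in place before any word entry copies it. The relevant constructor lines in their new order, with that reading as comments:

    // 1. complete the per-document flag template ...
    if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
    if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
    if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
    if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
    // ... 2. condense the document text (if indexText) ...
    // ... 3. and only then add the URL components, so their entries inherit the completed flags:
    insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);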
@@ -360,7 +359,7 @@ public final class plasmaCondenser {
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
}
- if ((last_index) && (word.equals("of"))) comb_indexof = true;
+ if ((last_index) && (wordminsize > 2 || (word.equals("of")))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
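The relaxed "Index of" detection presumably accounts for the minimum word length: with wordminsize > 2 the two-letter token "of" is sieved out before it reaches this loop, so the word right after "index" is taken as sufficient evidence of a directory listing. The condition, annotated:

    // old: only the literal pair "index" "of" set the flag
    // new: if short words are sieved out (wordminsize > 2), "of" never shows up here,
    //      so any word that directly follows "index" is accepted instead
    if ((last_index) && (wordminsize > 2 || (word.equals("of")))) comb_indexof = true;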
@@ -491,10 +490,10 @@ public final class plasmaCondenser {
else
return true;
}
-
+
public static Enumeration wordTokenizer(final String s, final String charset) {
try {
- return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes("UTF-8")));
+ return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)));
} catch (final Exception e) {
return null;
}
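wordTokenizer now encodes the input with the caller-supplied charset instead of always UTF-8; as before, it returns null when encoding fails, so callers should keep the null check used in ViewFile.java above. A short usage sketch (hypothetical inputs):

    // the bytes handed to the tokenizer are now produced with the charset the caller passes
    Enumeration ok  = plasmaCondenser.wordTokenizer("Zugriff auf die Datei", "ISO-8859-1");
    // an unsupported charset name makes the method return null rather than throw
    Enumeration bad = plasmaCondenser.wordTokenizer("anything", "not-a-charset"); // -> null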
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index e6c340e2b..347762564 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -330,13 +330,9 @@ public class plasmaSnippetCache {
// trying to load the resource from the cache
resContent = plasmaHTCache.getResourceContentStream(url);
responseHeader = plasmaHTCache.loadResponseHeader(url);
- if (resContent != null) {
- // if the content was found
- resContentLength = plasmaHTCache.getResourceContentLength(url);
- if ((resContentLength > maxDocLen) && (!fetchOnline)) {
- // content may be too large to be parsed here. To be fast, we omit calculation of snippet here
- return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
- }
+ if (resContent != null && ((resContentLength = plasmaHTCache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
+ // content may be too large to be parsed here. To be fast, we omit calculation of snippet here
+ return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
} else if (containsAllHashes(comp.dc_title(), queryhashes)) {
// try to create the snippet from the document title metadata
return new TextSnippet(url, (comp.dc_subject().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
@@ -346,7 +342,7 @@ public class plasmaSnippetCache {
} else if (containsAllHashes(comp.dc_subject(), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
- } else if (containsAllHashes(comp.url().toNormalform(true, true), queryhashes)) {
+ } else if (containsAllHashes(comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
// try to create the snippet from the words of the URL itself
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else if (fetchOnline) {
@@ -673,7 +669,7 @@ public class plasmaSnippetCache {
final int newlen = Math.max(10, maxpos - minpos + 10);
final int around = (maxLength - newlen) / 2;
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
- assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
+ //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around;
maxpos = sentence.length() - around - 5;
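Two notes on this file: replacing '-' with ' ' in the normalized URL lets hyphenated path words (for example a page named open-source-search) match the individual query word hashes; and the disabled assertion corresponds to the clamping performed on the following line, where the upper bound of the excerpt window is already capped at the sentence length. A simplified sketch of that windowing (both bounds clamped here for clarity; the original clamps only the upper one):

    // cut a sentence down to roughly maxLength characters around the matched words
    static String window(final String sentence, final int minpos, final int maxpos, final int maxLength) {
        final int newlen = Math.max(10, maxpos - minpos + 10);
        final int around = (maxLength - newlen) / 2;
        final int start = Math.max(0, minpos - around);                 // clamp lower bound
        final int end   = Math.min(sentence.length(), maxpos + around); // clamp upper bound (why the assert was dropped)
        return "[..] " + sentence.substring(start, end).trim() + " [..]";
    }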
diff --git a/source/de/anomic/yacy/yacyPeerSelection.java b/source/de/anomic/yacy/yacyPeerSelection.java
index 7efe954c5..ca7435e9a 100644
--- a/source/de/anomic/yacy/yacyPeerSelection.java
+++ b/source/de/anomic/yacy/yacyPeerSelection.java
@@ -100,7 +100,7 @@ public class yacyPeerSelection {
this.remaining = max;
this.doublecheck = new HashSet();
this.nextSeed = nextInternal();
- this.alsoMyOwn = alsoMyOwn && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
+ this.alsoMyOwn = alsoMyOwn && nextSeed != null && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
}
public boolean hasNext() {
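nextInternal() can yield null when no further seed is available; previously that could make the constructor fail with a NullPointerException on nextSeed.hash, whereas with the added guard alsoMyOwn simply falls back to false. The same expression, formatted for readability:

    // compare the own hash against the first candidate only if a candidate exists at all
    this.alsoMyOwn = alsoMyOwn
            && nextSeed != null
            && kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0;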