diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index 858186e6b..46fd69a9a 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -31,6 +31,7 @@ + @@ -93,6 +94,12 @@ #[attr]# #{/links}# +:: +
Parsed Tokens +
    #{words}# +
  1. #[word]#
  2. #{/words}# +
+
#(/viewMode)#

diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index dbdc6bf3f..21cfef6c2 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -29,6 +29,7 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URLDecoder; +import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -59,6 +60,7 @@ public class ViewFile { public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3; public static final int VIEW_MODE_AS_IFRAME = 4; public static final int VIEW_MODE_AS_LINKLIST = 5; + public static final int VIEW_MODE_AS_PARSED_WORDS = 6; private static final String HIGHLIGHT_CSS = "searchHighlight"; private static final int MAX_HIGHLIGHTS = 6; @@ -240,7 +242,7 @@ public class ViewFile { prop.put("viewMode", VIEW_MODE_AS_IFRAME); prop.put("viewMode_url", url.toNormalform(false, true)); - } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("links")) { + } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) { // parsing the resource content plasmaParserDocument document = null; try { @@ -297,6 +299,33 @@ public class ViewFile { } prop.put("viewMode_sentences", i); + } else if (viewMode.equals("words")) { + prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS); + final Iterator sentences = document.getSentences(pre); + + boolean dark = true; + int i = 0; + String sentence, token; + if (sentences != null) { + + // Search word highlighting + while (sentences.hasNext()) { + sentence = sentences.next().toString(); + Enumeration tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8"); + while (tokens.hasMoreElements()) { + token = tokens.nextElement().toString(); + if (token.length() > 0) { + prop.put("viewMode_words_" + i + "_nr", i + 1); + prop.put("viewMode_words_" + i + "_word", token); + prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0"); + dark = !dark; + i++; + } + } + } + } + prop.put("viewMode_words", i); + } else if (viewMode.equals("links")) { prop.put("viewMode", VIEW_MODE_AS_LINKLIST); boolean dark = true; diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index b0a92d097..d2238a65a 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -108,13 +108,15 @@ public final class plasmaCondenser { this.wordcut = 2; this.words = new TreeMap(); this.RESULT_FLAGS = new kelondroBitfield(4); + + // construct flag set for document + if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true); + if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true); + if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true); + if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true); this.languageIdentificator = new Identificator(); - //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia)); - - // add the URL components to the word list - insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false); Map.Entry entry; if (indexText) { @@ -161,6 +163,9 @@ public final class plasmaCondenser { this.RESULT_DIFF_SENTENCES = 0; } + // add the URL components to the word list + insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false); + if (indexMedia) { // add anchor descriptions: here, we also add the url components // audio @@ -209,12 +214,6 @@ public final class plasmaCondenser { } } } - - // construct flag set for document - if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true); - if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true); - if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true); - if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true); } private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) { @@ -360,7 +359,7 @@ public final class plasmaCondenser { this.RESULT_FLAGS.set(flag_cat_indexof, true); wordenum.pre(true); // parse lines as they come with CRLF } - if ((last_index) && (word.equals("of"))) comb_indexof = true; + if ((last_index) && (wordminsize > 2 || (word.equals("of")))) comb_indexof = true; last_last = word.equals("last"); last_index = word.equals("index"); @@ -491,10 +490,10 @@ public final class plasmaCondenser { else return true; } - + public static Enumeration wordTokenizer(final String s, final String charset) { try { - return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes("UTF-8"))); + return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset))); } catch (final Exception e) { return null; } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index e6c340e2b..347762564 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -330,13 +330,9 @@ public class plasmaSnippetCache { // trying to load the resource from the cache resContent = plasmaHTCache.getResourceContentStream(url); responseHeader = plasmaHTCache.loadResponseHeader(url); - if (resContent != null) { - // if the content was found - resContentLength = plasmaHTCache.getResourceContentLength(url); - if ((resContentLength > maxDocLen) && (!fetchOnline)) { - // content may be too large to be parsed here. To be fast, we omit calculation of snippet here - return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes"); - } + if (resContent != null && ((resContentLength = plasmaHTCache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) { + // content may be too large to be parsed here. To be fast, we omit calculation of snippet here + return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes"); } else if (containsAllHashes(comp.dc_title(), queryhashes)) { // try to create the snippet from information given in the url itself return new TextSnippet(url, (comp.dc_subject().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash())); @@ -346,7 +342,7 @@ public class plasmaSnippetCache { } else if (containsAllHashes(comp.dc_subject(), queryhashes)) { // try to create the snippet from information given in the subject metadata return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash())); - } else if (containsAllHashes(comp.url().toNormalform(true, true), queryhashes)) { + } else if (containsAllHashes(comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) { // try to create the snippet from information given in the subject metadata return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash())); } else if (fetchOnline) { @@ -673,7 +669,7 @@ public class plasmaSnippetCache { final int newlen = Math.max(10, maxpos - minpos + 10); final int around = (maxLength - newlen) / 2; assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); - assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); + //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]"; minpos = around; maxpos = sentence.length() - around - 5; diff --git a/source/de/anomic/yacy/yacyPeerSelection.java b/source/de/anomic/yacy/yacyPeerSelection.java index 7efe954c5..ca7435e9a 100644 --- a/source/de/anomic/yacy/yacyPeerSelection.java +++ b/source/de/anomic/yacy/yacyPeerSelection.java @@ -100,7 +100,7 @@ public class yacyPeerSelection { this.remaining = max; this.doublecheck = new HashSet(); this.nextSeed = nextInternal(); - this.alsoMyOwn = alsoMyOwn && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0); + this.alsoMyOwn = alsoMyOwn && nextSeed != null && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0); } public boolean hasNext() {