From fad044fb5403d58238812d70af75167890980cbd Mon Sep 17 00:00:00 2001 From: lotus Date: Sat, 8 Nov 2008 10:08:53 +0000 Subject: [PATCH] update to snippet marker: - do not display indexed html (solves xss issues) the single words are analyzed for already marked parts. this is needed to avoid false encoding of the marker (<b>) tags. - improved speed for existing routine heavily used regex patterns are precompiled now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5322 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../de/anomic/plasma/plasmaSnippetCache.java | 172 ++++++++++++------ 1 file changed, 118 insertions(+), 54 deletions(-) diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 733dbce20..f7f37e3d1 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -35,7 +35,10 @@ import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import de.anomic.htmlFilter.htmlFilterCharacterCoding; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.http.HttpClient; import de.anomic.http.httpResponseHeader; @@ -102,6 +105,27 @@ public class plasmaSnippetCache { Set remaingHashes; private final yacyURL favicon; + /** + * \\A[^\\p{L}\\p{N}].+ + */ + private final static Pattern p1 = Pattern.compile("\\A[^\\p{L}\\p{N}].+"); + /** + * .+[^\\p{L}\\p{N}]\\Z + */ + private final static Pattern p2 = Pattern.compile(".+[^\\p{L}\\p{N}]\\Z"); + /** + * \\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z + */ + private final static Pattern p3 = Pattern.compile("\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z"); + /** + * [^\\p{L}\\p{N}] + */ + private final static Pattern p4 = Pattern.compile("[^\\p{L}\\p{N}]"); + /** + * (.*?)(\\<b\\>.+?\\</b\\>)(.*) + */ + private final static Pattern p01 = Pattern.compile("(.*?)(\\.+?\\)(.*)"); // marked words are in <b>-tags + public 
TextSnippet(final yacyURL url, final String line, final int errorCode, final Set remaingHashes, final String errortext) { this(url,line,errorCode,remaingHashes,errortext,null); } @@ -117,6 +141,9 @@ public class plasmaSnippetCache { public yacyURL getUrl() { return this.url; } + public yacyURL getFavicon() { + return this.favicon; + } public boolean exists() { return line != null; } @@ -142,60 +169,18 @@ public class plasmaSnippetCache { final Iterator i = queryHashes.iterator(); String h; final String[] w = line.split(" "); - String prefix = ""; - String postfix = ""; - int len = 0; while (i.hasNext()) { h = i.next(); for (int j = 0; j < w.length; j++) { - //ignore punctuation marks (contrib [MN]) - //note to myself: - //For details on regex see "Mastering regular expressions" by J.E.F. Friedl - //especially p. 123 and p. 390/391 (in the German version of the 2nd edition) - - prefix = ""; - postfix = ""; - - // cut off prefix if it contains of non-characters or non-numbers - while(w[j].matches("\\A[^\\p{L}\\p{N}].+")) { - prefix = prefix + w[j].substring(0,1); - w[j] = w[j].substring(1); - } - - // cut off postfix if it contains of non-characters or non-numbers - while(w[j].matches(".+[^\\p{L}\\p{N}]\\Z")) { - len = w[j].length(); - postfix = w[j].substring(len-1,len) + postfix; - w[j] = w[j].substring(0,len-1); - } - - //special treatment if there is a special character in the word - if(w[j].matches("\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z")) { - String out = ""; - String temp = ""; - for(int k=0; k < w[j].length(); k++) { - //is character a special character? 
- if(w[j].substring(k,k+1).matches("[^\\p{L}\\p{N}]")) { - if (indexWord.word2hash(temp).equals(h)) temp = "" + temp + ""; - out = out + temp + w[j].substring(k,k+1); - temp = ""; - } - //last character - else if(k == (w[j].length()-1)) { - temp = temp + w[j].substring(k,k+1); - if (indexWord.word2hash(temp).equals(h)) temp = "" + temp + ""; - out = out + temp; - temp = ""; - } - else temp = temp + w[j].substring(k,k+1); - } - w[j] = out; - } - - //end contrib [MN] - else if (indexWord.word2hash(w[j]).equals(h)) w[j] = "" + w[j] + ""; - - w[j] = prefix + w[j] + postfix; + final ArrayList al = markedWordArrayList(w[j]); // mark special character separated words correctly if more than 1 word has to be marked + w[j] = ""; + for (int k = 0; k < al.size(); k++) { + if(k % 2 == 0){ // word has not been marked + w[j] += getWordMarked(al.get(k), h); + } else { // word has been marked, do not encode again + w[j] += al.get(k); + } + } } } final StringBuffer l = new StringBuffer(line.length() + queryHashes.size() * 8); @@ -205,10 +190,89 @@ public class plasmaSnippetCache { } return l.toString().trim(); } - - public yacyURL getFavicon() { - return this.favicon; + + /** + * mark words with <b>-tags + * @param word the word to mark + * @param h the hash of the word to mark + * @return the marked word if hash matches, else the unmarked word + * @see #getLineMarked(Set) + */ + private static String getWordMarked(String word, String h){ + //ignore punctuation marks (contrib [MN]) + //note to myself: + //For details on regex see "Mastering regular expressions" by J.E.F. Friedl + //especially p. 123 and p. 
390/391 (in the German version of the 2nd edition) + + String prefix = ""; + String postfix = ""; + int len = 0; + + // cut off prefix if it contains of non-characters or non-numbers + while(p1.matcher(word).find()) { + prefix = prefix + word.substring(0,1); + word = word.substring(1); + } + + // cut off postfix if it contains of non-characters or non-numbers + while(p2.matcher(word).find()) { + len = word.length(); + postfix = word.substring(len-1,len) + postfix; + word = word.substring(0,len-1); + } + + //special treatment if there is a special character in the word + if(p3.matcher(word).find()) { + String out = ""; + String temp = ""; + for(int k=0; k < word.length(); k++) { + //is character a special character? + if(p4.matcher(word.substring(k,k+1)).find()) { + if (indexWord.word2hash(temp).equals(h)) temp = "" + htmlFilterCharacterCoding.unicode2html(temp, false) + ""; + out = out + temp + htmlFilterCharacterCoding.unicode2html(word.substring(k,k+1), false); + temp = ""; + } + //last character + else if(k == (word.length()-1)) { + temp = temp + word.substring(k,k+1); + if (indexWord.word2hash(temp).equals(h)) temp = "" + htmlFilterCharacterCoding.unicode2html(temp, false) + ""; + out = out + temp; + temp = ""; + } + else temp = temp + word.substring(k,k+1); + } + word = out; + } + + //end contrib [MN] + else if (indexWord.word2hash(word).equals(h)) word = "" + htmlFilterCharacterCoding.unicode2html(word, false) + ""; + + word = htmlFilterCharacterCoding.unicode2html(prefix, false) + + word + + htmlFilterCharacterCoding.unicode2html(postfix, false); + return word; } + + /** + * words that already has been marked has index (i % 2 == 1) + * words that has not yet been marked has index (i % 2 == 0) + * @param string the String to be processed + * @return words that already has and has not yet been marked + * @author [DW], 08.11.2008 + */ + private static ArrayList markedWordArrayList(String string){ + ArrayList al = new java.util.ArrayList(1); + Matcher m = 
p01.matcher(string); + while (m.find()) { + al.add(m.group(1)); + al.add(m.group(2)); + string = m.group(3); // the postfix + m = p01.matcher(string); + } + al.add(string); + return al; + } + } public static class MediaSnippet {