From fad044fb5403d58238812d70af75167890980cbd Mon Sep 17 00:00:00 2001
From: lotus <lotus@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Sat, 8 Nov 2008 10:08:53 +0000
Subject: [PATCH] update to snippet marker: - do not display indexed html
 (solves xss issues); the single words are analyzed for already marked parts.
 This is needed to avoid false encoding of the marker (<b>) tags. - improved
 speed of the existing routine: heavily used regex patterns are precompiled now

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5322 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../de/anomic/plasma/plasmaSnippetCache.java  | 172 ++++++++++++------
 1 file changed, 118 insertions(+), 54 deletions(-)
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 733dbce20..f7f37e3d1 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -35,7 +35,10 @@ import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
+import de.anomic.htmlFilter.htmlFilterCharacterCoding;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.http.HttpClient;
 import de.anomic.http.httpResponseHeader;
@@ -102,6 +105,27 @@ public class plasmaSnippetCache {
         Set<String> remaingHashes;
         private final yacyURL favicon;
         
+        /**
+         * Leading special character (neither letter nor number) followed by at least one more character: <code>\\A[^\\p{L}\\p{N}].+</code>
+         */
+        private final static Pattern p1 = Pattern.compile("\\A[^\\p{L}\\p{N}].+");
+        /**
+         * At least one character followed by a trailing special character (neither letter nor number): <code>.+[^\\p{L}\\p{N}]\\Z</code>
+         */
+        private final static Pattern p2 = Pattern.compile(".+[^\\p{L}\\p{N}]\\Z");
+        /**
+         * One or more letters/numbers at the start, then a special character, then at least one more character: <code>\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z</code>
+         */
+        private final static Pattern p3 = Pattern.compile("\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z");
+        /**
+         * A single special character, i.e. one that is neither a letter nor a number: <code>[^\\p{L}\\p{N}]</code>
+         */
+        private final static Pattern p4 = Pattern.compile("[^\\p{L}\\p{N}]");
+        /**
+         * Lazy prefix (group 1), the first &lt;b&gt;...&lt;/b&gt; span with non-empty content (group 2) and the rest (group 3): <code>(.*?)(\\&lt;b\\&gt;.+?\\&lt;/b\\&gt;)(.*)</code>
+         */
+		private final static Pattern p01 = Pattern.compile("(.*?)(\\<b\\>.+?\\</b\\>)(.*)"); // marked words are in <b>-tags
+        
         public TextSnippet(final yacyURL url, final String line, final int errorCode, final Set<String> remaingHashes, final String errortext) {
         	this(url,line,errorCode,remaingHashes,errortext,null);
         }
@@ -117,6 +141,9 @@ public class plasmaSnippetCache {
         public yacyURL getUrl() {
             return this.url;
         }
+        public yacyURL getFavicon() { // accessor for the favicon field of this snippet
+        	return this.favicon;
+        }
         public boolean exists() {
             return line != null;
         }
@@ -142,60 +169,18 @@ public class plasmaSnippetCache {
             final Iterator<String> i = queryHashes.iterator();
             String h;
             final String[] w = line.split(" ");
-            String prefix = "";
-            String postfix = "";
-            int len = 0;
             while (i.hasNext()) {
                 h = i.next();
                 for (int j = 0; j < w.length; j++) {
-                    //ignore punctuation marks (contrib [MN])
-                    //note to myself:
-                    //For details on regex see "Mastering regular expressions" by J.E.F. Friedl
-                    //especially p. 123 and p. 390/391 (in the German version of the 2nd edition)
-
-                    prefix = "";
-                    postfix = "";
-
-                    // cut off prefix if it contains of non-characters or non-numbers
-                    while(w[j].matches("\\A[^\\p{L}\\p{N}].+")) {
-                        prefix = prefix + w[j].substring(0,1);
-                        w[j] = w[j].substring(1);
-                    }
-
-                    // cut off postfix if it contains of non-characters or non-numbers
-                    while(w[j].matches(".+[^\\p{L}\\p{N}]\\Z")) {
-                        len = w[j].length();
-                        postfix = w[j].substring(len-1,len) + postfix;
-                        w[j] = w[j].substring(0,len-1);
-                    }
-
-                    //special treatment if there is a special character in the word
-                    if(w[j].matches("\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z")) {
-                        String out = "";
-                        String temp = "";
-                        for(int k=0; k < w[j].length(); k++) {
-                            //is character a special character?
-                            if(w[j].substring(k,k+1).matches("[^\\p{L}\\p{N}]")) {
-                                if (indexWord.word2hash(temp).equals(h)) temp = "<b>" + temp + "</b>";
-                                out = out + temp + w[j].substring(k,k+1);
-                                temp = "";
-                            }
-                            //last character
-                            else if(k == (w[j].length()-1)) {
-                                temp = temp + w[j].substring(k,k+1);
-                                if (indexWord.word2hash(temp).equals(h)) temp = "<b>" + temp + "</b>";
-                                out = out + temp;
-                                temp = "";
-                            }
-                            else temp = temp + w[j].substring(k,k+1);
-                        }
-                        w[j] = out;
-                    }
-
-                    //end contrib [MN]
-                    else if (indexWord.word2hash(w[j]).equals(h)) w[j] = "<b>" + w[j] + "</b>";
-
-                    w[j] = prefix + w[j] + postfix;
+            		final ArrayList<String> al = markedWordArrayList(w[j]); // mark special character separated words correctly if more than 1 word has to be marked
+            		w[j] = "";
+            		for (int k = 0; k < al.size(); k++) {
+            			if(k % 2 == 0){ // word has not been marked
+            				w[j] += getWordMarked(al.get(k), h);
+            			} else { // word has been marked, do not encode again
+            				w[j] += al.get(k);
+            			}
+            		}
                 }
             }
             final StringBuffer l = new StringBuffer(line.length() + queryHashes.size() * 8);
@@ -205,10 +190,89 @@ public class plasmaSnippetCache {
             }
             return l.toString().trim();
         }
-        
-        public yacyURL getFavicon() {
-        	return this.favicon;
+
+        /**
+         * Wraps the parts of <code>word</code> whose hash equals <code>h</code> in &lt;b&gt;-tags; marked parts, special characters and the cut-off prefix/postfix are HTML-encoded.
+         * @param word the word to mark (may carry leading, trailing or embedded special characters)
+         * @param h the word hash to look for, compared against <code>indexWord.word2hash</code> of each part
+         * @return the word with matching parts marked; parts that do not match are copied unchanged (without encoding)
+         * @see #getLineMarked(Set)
+         */
+        private static String getWordMarked(String word, String h){
+            //ignore punctuation marks when comparing hashes (contrib [MN])
+            //note to myself:
+            //For details on regex see "Mastering regular expressions" by J.E.F. Friedl
+            //especially p. 123 and p. 390/391 (in the German version of the 2nd edition)
+
+            String prefix = "";
+            String postfix = "";
+            int len = 0;
+
+            // cut off leading characters that are neither letters nor numbers, one at a time (p1 requires at least one character to remain)
+            while(p1.matcher(word).find()) {
+                prefix = prefix + word.substring(0,1);
+                word = word.substring(1);
+            }
+
+            // cut off trailing characters that are neither letters nor numbers, one at a time (p2 requires at least one character to remain)
+            while(p2.matcher(word).find()) {
+                len = word.length();
+                postfix = word.substring(len-1,len) + postfix;
+                word = word.substring(0,len-1);
+            }
+
+            // special treatment if a special character is embedded in the word: each letter/number segment is checked on its own
+            if(p3.matcher(word).find()) {
+                String out = "";
+                String temp = "";
+                for(int k=0; k < word.length(); k++) {
+                    // special character: close the current segment, mark it if its hash matches, then append the encoded character
+                    if(p4.matcher(word.substring(k,k+1)).find()) {
+                        if (indexWord.word2hash(temp).equals(h)) temp = "<b>" + htmlFilterCharacterCoding.unicode2html(temp, false) + "</b>";
+                        out = out + temp + htmlFilterCharacterCoding.unicode2html(word.substring(k,k+1), false);
+                        temp = "";
+                    }
+                    // last character: complete the final segment and mark it if its hash matches
+                    else if(k == (word.length()-1)) {
+                        temp = temp + word.substring(k,k+1);
+                        if (indexWord.word2hash(temp).equals(h)) temp = "<b>" + htmlFilterCharacterCoding.unicode2html(temp, false) + "</b>";
+                        out = out + temp;
+                        temp = "";
+                    }
+                    else temp = temp + word.substring(k,k+1);
+                }
+                word = out;
+            }
+
+            //end contrib [MN]; without an embedded special character the whole remaining word is compared
+            else if (indexWord.word2hash(word).equals(h)) word = "<b>" + htmlFilterCharacterCoding.unicode2html(word, false) + "</b>";
+
+            word = htmlFilterCharacterCoding.unicode2html(prefix, false)
+            	+ word
+            	+ htmlFilterCharacterCoding.unicode2html(postfix, false);
+            return word;
         }
+        
+    	/**
+    	 * Splits a string into alternating parts: even indexes <code>(i % 2 == 0)</code> hold text that has not yet been marked (possibly empty),
+    	 * odd indexes <code>(i % 2 == 1)</code> hold text that has already been marked, i.e. a complete &lt;b&gt;...&lt;/b&gt; span.
+    	 * @param string the String to be processed
+    	 * @return the parts in their original order; concatenated they yield the input again, the last element is always an unmarked part
+    	 * @author [DW], 08.11.2008
+    	 */
+    	private static ArrayList<String> markedWordArrayList(String string){
+    	    final ArrayList<String> al = new ArrayList<String>(1);
+    		Matcher m = p01.matcher(string);
+    		while (m.find()) {
+    			al.add(string.substring(0, m.end(1))); // sliced from the input: keeps text find() skipped because '.' does not match line terminators
+    			al.add(m.group(2));
+    			string = string.substring(m.end(2)); // the postfix, sliced from the input so text after a line terminator is not dropped
+    			m = p01.matcher(string);
+    			}
+    		al.add(string);
+    		return al;
+    	}
+
     }
     
     public static class MediaSnippet {