removed lowercasing of snippets (and other things):

- added new sentence parser to condenser
- sentence parsing can now handle charsets

to do: charsets must be handed over to the new sentence parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2712 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 43614f1b36
commit 1969522dc1
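
A minimal usage sketch of the enumeration-based sentence API this commit introduces (illustrative only: the demo class, sample text and UTF-8 choice are invented; it assumes de.anomic.plasma.plasmaCondenser from this revision is on the classpath):

import java.io.ByteArrayInputStream;
import java.util.Enumeration;
import de.anomic.plasma.plasmaCondenser;

public class SentenceEnumDemo {
    public static void main(String[] args) throws Exception {
        byte[] text = "First line. Second line! Third?".getBytes("UTF-8");
        // an explicit charset exercises the new parameter; passing null falls
        // back to the platform default, which is the open "to do" above
        Enumeration sentences = plasmaCondenser.sentencesFromInputStream(
                new ByteArrayInputStream(text), "UTF-8");
        // the factory returns null if the charset name is not supported
        if (sentences != null) while (sentences.hasMoreElements()) {
            System.out.println((String) sentences.nextElement());
        }
    }
}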

@@ -48,6 +48,7 @@
import java.io.File;
import java.io.Writer;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@@ -128,9 +129,9 @@ public class CacheAdmin_p {
.append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
.append("<b>TEXT:</b><br><span class=\"small\">").append(new String(scraper.getText())).append("</span><br>")
.append("<b>LINES:</b><br><span class=\"small\">");
final String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) {
info.append(sentences[i]).append("<br>");
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
if (sentences != null) while (sentences.hasMoreElements()) {
info.append((String) sentences.nextElement()).append("<br>");
}
info.append("</span><br>");
if (document != null) document.close();

@@ -49,6 +49,7 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Enumeration;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
@@ -262,11 +263,12 @@ public class ViewFile {
prop.put("viewMode_parsedText",content);
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
String[] sentences = document.getSentences();
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
boolean dark = true;
for (int i=0; i < sentences.length; i++) {
String currentSentence = wikiCode.replaceHTML(sentences[i]);
int i = 0;
if (sentences != null) while (sentences.hasMoreElements()) {
String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
// Search word highlighting
String words = post.get("words",null);
@@ -286,8 +288,9 @@
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
i++;
}
prop.put("viewMode_sentences",sentences.length);
prop.put("viewMode_sentences", i);
}
if (document != null) document.close();

@@ -51,6 +51,8 @@ import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
@@ -469,7 +471,7 @@ public final class plasmaCondenser {
}
protected final static boolean punctuation(char c) {
return ("!?.".indexOf(c) >= 0);
return (c == '.') || (c == '!') || (c == '?');
}
public final static boolean invisible(char c) {
@@ -648,7 +650,89 @@ public final class plasmaCondenser {
return counter;
}
}
public static Enumeration sentencesFromInputStream(InputStream is, String charset) {
try {
return new sentencesFromInputStreamEnum(is, charset);
} catch (UnsupportedEncodingException e) {
return null;
}
}
private static class sentencesFromInputStreamEnum implements Enumeration {
// read sentences from a given input stream
// this enumerates String objects
Object buffer = null;
BufferedReader raf;
int counter = 0;
public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset));
buffer = nextElement0();
counter = 0;
}
private Object nextElement0() {
try {
String s = readSentence(raf);
if (s == null) {
raf.close();
return null;
}
return s;
} catch (IOException e) {
try {
raf.close();
} catch (Exception ee) {
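// the reader is abandoned after the read error anyway, so a failure
// to close it is deliberately ignored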
}
return null;
}
}
public boolean hasMoreElements() {
return buffer != null;
}
public Object nextElement() {
if (buffer == null) {
return null;
} else {
counter = counter + ((String) buffer).length() + 1;
Object r = buffer;
buffer = nextElement0();
return r;
}
}
public int count() {
return counter;
}
}
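// Illustrative comment, not part of the original code: the class keeps a
// one-sentence look-ahead in `buffer`, so hasMoreElements() can answer
// without consuming more input, and counter advances by sentence length
// plus one for every element handed out. For example:
//
//   Enumeration e = sentencesFromInputStream(
//       new ByteArrayInputStream("One. Two!".getBytes("UTF-8")), "UTF-8");
//   e.nextElement();   // "One."
//   e.nextElement();   // " Two!" (leading blank; see readSentence below)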
static String readSentence(Reader reader) throws IOException {
StringBuffer s = new StringBuffer();
int nextChar;
char c;
// find sentence end
for (;;) {
nextChar = reader.read();
if (nextChar < 0) return null;
c = (char) nextChar;
s.append(c);
if (punctuation(c)) break;
}
// replace line endings and tabs by blanks
for (int i = 0; i < s.length(); i++) {
if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 9)) s.setCharAt(i, ' ');
}
// remove all double-spaces
int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
return new String(s);
}
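// Worked example (illustrative): calling readSentence from within this
// package on new StringReader("a  b\nc. rest") reads up to and including
// the first '.', '!' or '?', turns the newline into a blank and collapses
// the double space, returning "a b c."; the remainder " rest" stays in the
// reader, and a following call returns null because no further punctuation
// appears before end of input.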
/*
private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) {
// we store lines containing a key in search vector

@@ -55,6 +55,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@@ -815,12 +816,12 @@ public final class plasmaParser {
System.out.println(document.getMainLongTitle());
// found text
String[] sentences = document.getSentences();
if (sentences != null) {
for (int i = 0; i < sentences.length; i++) {
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
int i = 0;
if (sentences != null) while (sentences.hasMoreElements()) {
System.out.print("line " + i + ": ");
System.out.println(sentences[i]);
}
System.out.println((String) sentences.nextElement());
i++;
}
// found links

@@ -50,6 +50,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverFileUtils;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -189,13 +190,9 @@ public class plasmaParserDocument {
return -1;
}
public plasmaCondenser getCondenser() {
if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0);
return condenser;
}
public String[] getSentences() {
return getCondenser().sentences();
public Enumeration getSentences(String charset) {
if (this.text == null) return null;
return plasmaCondenser.sentencesFromInputStream(getText(), charset);
}
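// Hypothetical resolution of the "FIXME: apply correct charset" at the
// call sites above (the accessor getCharset() is assumed and does not
// exist in this revision): pass the charset detected for the document
// instead of null, e.g.
//
//   Enumeration sentences = document.getSentences(document.getCharset());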
public String getKeywords(char separator) {

@@ -47,6 +47,7 @@ package de.anomic.plasma;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
@@ -249,10 +250,10 @@ public class plasmaSnippetCache {
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url);
String[] sentences = document.getSentences();
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
document.close();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
if (sentences == null) {
//System.out.println("found no sentences in url " + url);
return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
}
@@ -357,26 +358,30 @@ public class plasmaSnippetCache {
return (String) snippetsCache.get(key);
}
private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
try {
if ((sentences == null) || (sentences.length == 0)) return null;
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
HashMap hs;
String hash;
for (int i = 0; i < sentences.length; i++) {
ArrayList sb = new ArrayList();
String sentence;
while (sentences.hasMoreElements()) {
sentence = (String) sentences.nextElement();
//System.out.println("Sentence " + i + ": " + sentences[i]);
if (sentences[i].length() > minLength) {
hs = hashSentence(sentences[i]);
if (sentence.length() > minLength) {
hs = hashSentence(sentence);
j = queryhashes.iterator();
while (j.hasNext()) {
hash = (String) j.next();
if (hs.containsKey(hash)) {
//System.out.println("hash " + hash + " appears in line " + i);
hitTable.incScore(new Integer(i));
hitTable.incScore(new Integer(sb.size()));
}
}
sb.add(sentence);
}
}
int score = hitTable.getMaxScore(); // best number of hits
@@ -385,15 +390,14 @@
// now find the shortest line of these hits
int shortLineIndex = -1;
int shortLineLength = Integer.MAX_VALUE;
for (int i = 0; i < sentences.length; i++) {
if ((hitTable.getScore(new Integer(i)) == score) &&
(sentences[i].length() < shortLineLength)) {
for (int i = 0; i < sb.size(); i++) {
if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) {
shortLineIndex = i;
shortLineLength = sentences[i].length();
shortLineLength = ((String) sb.get(i)).length();
}
}
// find a first result
String result = sentences[shortLineIndex];
String result = (String) sb.get(shortLineIndex);
// remove all hashes that appear in the result
hs = hashSentence(result);
j = queryhashes.iterator();
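
Because the sentence Enumeration can be traversed only once, computeSnippet now copies every sufficiently long sentence into the ArrayList sb during the scoring pass; the list index doubles as the key in hitTable, so the second pass can still pair scores with sentences and pick the shortest top-scoring line. The buffering step in isolation (a generic sketch in the pre-generics style of this code base, not YaCy code):

import java.util.ArrayList;
import java.util.Enumeration;

public class EnumBufferSketch {
    // drain a single-pass Enumeration into a random-access list so that a
    // second, index-based pass over the elements becomes possible
    static ArrayList toList(Enumeration e) {
        ArrayList list = new ArrayList();
        while (e.hasMoreElements()) list.add(e.nextElement());
        return list;
    }
}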
