From 1969522dc194ef385f54fbd6f3910ff620857da9 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Sat, 7 Oct 2006 00:06:09 +0000
Subject: [PATCH] removed lowercase of snippets (and other things): - added new
 sentence parser to condenser - sentence parsing can now handle charsets

to do: charsets must be handed over to new sentence parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2712 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/CacheAdmin_p.java                      |  7 +-
 htroot/ViewFile.java                          | 11 ++-
 source/de/anomic/plasma/plasmaCondenser.java  | 86 ++++++++++++++++++-
 source/de/anomic/plasma/plasmaParser.java     | 11 +--
 .../anomic/plasma/plasmaParserDocument.java   | 11 +--
 .../de/anomic/plasma/plasmaSnippetCache.java  | 30 ++++---
 6 files changed, 123 insertions(+), 33 deletions(-)
diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 40f414288..f3e1041c0 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -48,6 +48,7 @@
 
 import java.io.File;
 import java.io.Writer;
+import java.util.Enumeration;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeSet;
@@ -128,9 +129,9 @@ public class CacheAdmin_p {
                         .append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
                         .append("<b>TEXT:</b><br><span class=\"small\">").append(new String(scraper.getText())).append("</span><br>")
                         .append("<b>LINES:</b><br><span class=\"small\">");
-                    final String[] sentences = document.getSentences();
-                    for (int i = 0; i < sentences.length; i++) {
-                        info.append(sentences[i]).append("<br>");
+                    final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+                    if (sentences != null) while (sentences.hasMoreElements()) {
+                        info.append((String) sentences.nextElement()).append("<br>");
                     }
                     info.append("</span><br>");
                     if (document != null) document.close();
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 525c9d7e4..8681df3b4 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -49,6 +49,7 @@ import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
+import java.util.Enumeration;
 
 import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
@@ -262,11 +263,12 @@ public class ViewFile {
                     prop.put("viewMode_parsedText",content);
                 } else {
                     prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
-                    String[] sentences = document.getSentences();
+                    final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
 
                     boolean dark = true;
-                    for (int i=0; i < sentences.length; i++) {
-                        String currentSentence = wikiCode.replaceHTML(sentences[i]);
+                    int i = 0;
+                    if (sentences != null) while (sentences.hasMoreElements()) {
+                        String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
 
                         // Search word highlighting
                         String words = post.get("words",null);
@@ -286,8 +288,9 @@ public class ViewFile {
                         prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); 
                         prop.put("viewMode_sentences_" + i + "_text",currentSentence);   
                         prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+                        i++;
                     }
-                    prop.put("viewMode_sentences",sentences.length);
+                    prop.put("viewMode_sentences", i);
 
                 } 
                 if (document != null) document.close();
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index d72eb43f8..6dc70d2f4 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -51,6 +51,8 @@ import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
 import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -469,7 +471,7 @@ public final class plasmaCondenser {
     }
 
     protected final static boolean punctuation(char c) {
-        return ("!?.".indexOf(c) >= 0);
+        return (c == '.') || (c == '!') || (c == '?'); // true for the characters that end a sentence
     }
 
     public final static boolean invisible(char c) {
@@ -648,7 +650,89 @@ public final class plasmaCondenser {
             return counter;
         }
     }
+    
+    public static Enumeration sentencesFromInputStream(InputStream is, String charset) { // enumerates the sentences read from 'is' as String objects
+        try {
+            return new sentencesFromInputStreamEnum(is, charset); // a null charset makes the reader use the platform default encoding
+        } catch (UnsupportedEncodingException e) {
+            return null; // the charset name is not supported; callers have to check for null
+        }
+    }
+    
+    private static class sentencesFromInputStreamEnum implements Enumeration {
+        // reads sentences from a given input stream using readSentence(); this enumerates String objects
+        // one sentence is always read ahead into 'buffer'; buffer == null means that no more elements exist
+        // the reader is closed as soon as readSentence() returns null or throws an IOException
+        Object buffer = null; // read-ahead sentence that the next call of nextElement() returns
+        BufferedReader raf; // character source that wraps the given input stream
+        int counter = 0; // sum of (length + 1) of all sentences handed out so far
+
+        public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
+            raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset)); // without a charset the platform default encoding is used
+            buffer = nextElement0(); // read ahead the first sentence
+            counter = 0;
+        }
+        // fetches the next sentence; closes the reader and returns null if readSentence() yields null or reading fails
+        private Object nextElement0() {
+            try {
+                String s = readSentence(raf);
+                if (s == null) {
+                    raf.close();
+                    return null;
+                }
+                return s;
+            } catch (IOException e) {
+                try {
+                    raf.close();
+                } catch (Exception ee) { // errors while closing are ignored
+                }
+                return null;
+            }
+        }
+        // true while a read-ahead sentence is available
+        public boolean hasMoreElements() {
+            return buffer != null;
+        }
+        // hands out the read-ahead sentence and loads its successor; returns null if the enumeration is exhausted
+        public Object nextElement() {
+            if (buffer == null) {
+                return null;
+            } else {
+                counter = counter + ((String) buffer).length() + 1;
+                Object r = buffer;
+                buffer = nextElement0();
+                return r;
+            }
+        }

+        public int count() { // characters handed out so far, plus one per sentence
+            return counter;
+        }
+    }
+
+    // Reads the next sentence from reader: all characters up to and including the next '.', '!' or '?'.
+    // Returns null when no text is left; a last sentence without closing punctuation is returned as well.
+    static String readSentence(Reader reader) throws IOException {
+        StringBuffer s = new StringBuffer();
+        int nextChar;
+        char c;
+        for (;;) {
+            nextChar = reader.read();
+            if (nextChar < 0) {
+                // end of stream: do not drop pending text, but ignore a remainder that is empty or only a blank
+                if (s.toString().trim().length() == 0) return null;
+                break;
+            }
+            c = (char) nextChar;
+            // replace line endings and tabs by blanks; 9 is the tab, 8 (backspace) is kept for compatibility
+            if ((c == (char) 10) || (c == (char) 13) || (c == (char) 9) || (c == (char) 8)) c = ' ';
+            // skip a blank that follows a blank, so that no double-spaces remain
+            if ((c == ' ') && (s.length() > 0) && (s.charAt(s.length() - 1) == ' ')) continue;
+            s.append(c);
+            if (punctuation(c)) break; // sentence end found
+        }
+        return new String(s);
+    }
     /*
     private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) {
         // we store lines containing a key in search vector
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index b420b7ffc..81404dc06 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -55,6 +55,7 @@ import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.util.Arrays;
+import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
@@ -815,12 +816,12 @@ public final class plasmaParser {
                 System.out.println(document.getMainLongTitle());
                 
                 // found text
-                String[] sentences = document.getSentences();
-                if (sentences != null) {
-                    for (int i = 0; i < sentences.length; i++) {
+                final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+                int i = 0;
+                if (sentences != null) while (sentences.hasMoreElements()) {
                         System.out.print("line " + i + ": ");
-                        System.out.println(sentences[i]);
-                    }
+                        System.out.println((String) sentences.nextElement());
+                        i++;
                 }
                 
                 // found links
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index a9b276d1f..d032d7b9e 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -50,6 +50,7 @@ import java.io.InputStream;
 import java.net.MalformedURLException;
 import de.anomic.server.serverFileUtils;
 
+import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
@@ -189,13 +190,9 @@ public class plasmaParserDocument {
         return -1; 
     }
     
-    public plasmaCondenser getCondenser() {
-        if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0);
-        return condenser;
-    }
-    
-    public String[] getSentences() {
-        return getCondenser().sentences();
+    public Enumeration getSentences(String charset) { // enumerates the sentences of the document text, decoded with charset
+        if (this.text == null) return null; // no text available
+        return plasmaCondenser.sentencesFromInputStream(getText(), charset); // null here as well if the charset is not supported
     }
     
     public String getKeywords(char separator) {
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 80a63a1a1..e1e7e71eb 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -47,6 +47,7 @@ package de.anomic.plasma;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -249,10 +250,10 @@ public class plasmaSnippetCache {
         if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
                 
         //System.out.println("loaded document for URL " + url);
-        String[] sentences = document.getSentences();
+        final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
         document.close();
         //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
-        if ((sentences == null) || (sentences.length == 0)) {
+        if (sentences == null) {
             //System.out.println("found no sentences in url " + url);
             return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
         }
@@ -357,26 +358,30 @@ public class plasmaSnippetCache {
         return (String) snippetsCache.get(key);
     }
     
-    private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
+    private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
         try {
-            if ((sentences == null) || (sentences.length == 0)) return null;
+            if (sentences == null) return null;
             if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
             kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
             Iterator j;
             HashMap hs;
             String hash;
-            for (int i = 0; i < sentences.length; i++) {
+            ArrayList sb = new ArrayList();
+            String sentence;
+            while (sentences.hasMoreElements()) {
+                sentence = (String) sentences.nextElement();
                 //System.out.println("Sentence " + i + ": " + sentences[i]);
-                if (sentences[i].length() > minLength) {
-                    hs = hashSentence(sentences[i]);
+                if (sentence.length() > minLength) {
+                    hs = hashSentence(sentence);
                     j = queryhashes.iterator();
                     while (j.hasNext()) {
                         hash = (String) j.next();
                         if (hs.containsKey(hash)) {
                             //System.out.println("hash " + hash + " appears in line " + i);
-                            hitTable.incScore(new Integer(i));
+                            hitTable.incScore(new Integer(sb.size()));
                         }
                     }
+                    sb.add(sentence);
                 }
             }
             int score = hitTable.getMaxScore(); // best number of hits
@@ -385,15 +390,14 @@ public class plasmaSnippetCache {
             // now find the shortest line of these hits
             int shortLineIndex = -1;
             int shortLineLength = Integer.MAX_VALUE;
-            for (int i = 0; i < sentences.length; i++) {
-                if ((hitTable.getScore(new Integer(i)) == score) &&
-                (sentences[i].length() < shortLineLength)) {
+            for (int i = 0; i < sb.size(); i++) {
+                if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) {
                     shortLineIndex = i;
-                    shortLineLength = sentences[i].length();
+                    shortLineLength = ((String) sb.get(i)).length();
                 }
             }
             // find a first result
-            String result = sentences[shortLineIndex];
+            String result = (String) sb.get(shortLineIndex);
             // remove all hashes that appear in the result
             hs = hashSentence(result);
             j = queryhashes.iterator();