From 1969522dc194ef385f54fbd6f3910ff620857da9 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 7 Oct 2006 00:06:09 +0000 Subject: [PATCH] removed lowercase of snippets (and other things): - added new sentence parser to condenser - sentence parsing can now handle charsets to do: charsets must be handed over to new sentence parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2712 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 7 +- htroot/ViewFile.java | 11 ++- source/de/anomic/plasma/plasmaCondenser.java | 86 ++++++++++++++++++- source/de/anomic/plasma/plasmaParser.java | 11 +-- .../anomic/plasma/plasmaParserDocument.java | 11 +-- .../de/anomic/plasma/plasmaSnippetCache.java | 30 ++++--- 6 files changed, 123 insertions(+), 33 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 40f414288..f3e1041c0 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -48,6 +48,7 @@ import java.io.File; import java.io.Writer; +import java.util.Enumeration; import java.util.Iterator; import java.util.Map; import java.util.TreeSet; @@ -128,9 +129,9 @@ public class CacheAdmin_p { .append("EMAIL:
").append(formatAnchor(document.getEmaillinks())).append("
") .append("TEXT:
").append(new String(scraper.getText())).append("
") .append("LINES:
"); - final String[] sentences = document.getSentences(); - for (int i = 0; i < sentences.length; i++) { - info.append(sentences[i]).append("
"); + final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset + if (sentences != null) while (sentences.hasMoreElements()) { + info.append((String) sentences.nextElement()).append("
"); } info.append("

"); if (document != null) document.close(); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 525c9d7e4..8681df3b4 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -49,6 +49,7 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; +import java.util.Enumeration; import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; @@ -262,11 +263,12 @@ public class ViewFile { prop.put("viewMode_parsedText",content); } else { prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); - String[] sentences = document.getSentences(); + final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset boolean dark = true; - for (int i=0; i < sentences.length; i++) { - String currentSentence = wikiCode.replaceHTML(sentences[i]); + int i = 0; + if (sentences != null) while (sentences.hasMoreElements()) { + String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement()); // Search word highlighting String words = post.get("words",null); @@ -286,8 +288,9 @@ public class ViewFile { prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); prop.put("viewMode_sentences_" + i + "_text",currentSentence); prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark; + i++; } - prop.put("viewMode_sentences",sentences.length); + prop.put("viewMode_sentences", i); } if (document != null) document.close(); diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index d72eb43f8..6dc70d2f4 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -51,6 +51,8 @@ import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; @@ -469,7 +471,7 @@ public final class plasmaCondenser { } protected final static boolean punctuation(char c) { - return ("!?.".indexOf(c) >= 0); + return (c == '.') || (c == '!') || (c == '?'); } public final static boolean invisible(char c) { @@ -648,7 +650,89 @@ public final class plasmaCondenser { return counter; } } + + public static Enumeration sentencesFromInputStream(InputStream is, String charset) { + try { + return new sentencesFromInputStreamEnum(is, charset); + } catch (UnsupportedEncodingException e) { + return null; + } + } + + private static class sentencesFromInputStreamEnum implements Enumeration { + // read sentences from a given input stream + // this enumerates String objects + + Object buffer = null; + BufferedReader raf; + int counter = 0; + + public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException { + raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset)); + buffer = nextElement0(); + counter = 0; + } + + private Object nextElement0() { + try { + String s = readSentence(raf); + if (s == null) { + raf.close(); + return null; + } + return s; + } catch (IOException e) { + try { + raf.close(); + } catch (Exception ee) { + } + return null; + } + } + + public boolean hasMoreElements() { + return buffer != null; + } + + public Object nextElement() { + if (buffer == null) { + return null; + } else { + counter = counter + ((String) buffer).length() + 1; + Object r = buffer; + buffer = nextElement0(); + return r; + } + } + public int count() { + return counter; + } + } + + static String readSentence(Reader reader) throws IOException { + StringBuffer s = new StringBuffer(); + int nextChar; + char c; + + // find sentence end + for (;;) { + nextChar = reader.read(); + if (nextChar < 0) return null; + c = (char) nextChar; + s.append(c); + if (punctuation(c)) break; + } + + // replace line endings and tabs by blanks + for (int i = 0; i < s.length(); i++) { + if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' '); + } + // remove all double-spaces + int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p); + return new String(s); + + } /* private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) { // we store lines containing a key in search vector diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index b420b7ffc..81404dc06 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -55,6 +55,7 @@ import java.io.InputStream; import java.net.MalformedURLException; import java.net.URI; import java.util.Arrays; +import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Hashtable; @@ -815,12 +816,12 @@ public final class plasmaParser { System.out.println(document.getMainLongTitle()); // found text - String[] sentences = document.getSentences(); - if (sentences != null) { - for (int i = 0; i < sentences.length; i++) { + final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset + int i = 0; + if (sentences != null) while (sentences.hasMoreElements()) { System.out.print("line " + i + ": "); - System.out.println(sentences[i]); - } + System.out.println((String) sentences.nextElement()); + i++; } // found links diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index a9b276d1f..d032d7b9e 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -50,6 +50,7 @@ import java.io.InputStream; import java.net.MalformedURLException; import de.anomic.server.serverFileUtils; +import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -189,13 +190,9 @@ public class plasmaParserDocument { return -1; } - public plasmaCondenser getCondenser() { - if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0); - return condenser; - } - - public String[] getSentences() { - return getCondenser().sentences(); + public Enumeration getSentences(String charset) { + if (this.text == null) return null; + return plasmaCondenser.sentencesFromInputStream(getText(), charset); } public String getKeywords(char separator) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 80a63a1a1..e1e7e71eb 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -47,6 +47,7 @@ package de.anomic.plasma; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; @@ -249,10 +250,10 @@ public class plasmaSnippetCache { if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed //System.out.println("loaded document for URL " + url); - String[] sentences = document.getSentences(); + final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset document.close(); //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]); - if ((sentences == null) || (sentences.length == 0)) { + if (sentences == null) { //System.out.println("found no sentences in url " + url); return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences"); } @@ -357,26 +358,30 @@ public class plasmaSnippetCache { return (String) snippetsCache.get(key); } - private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) { + private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) { try { - if ((sentences == null) || (sentences.length == 0)) return null; + if (sentences == null) return null; if ((queryhashes == null) || (queryhashes.size() == 0)) return null; kelondroMScoreCluster hitTable = new kelondroMScoreCluster(); Iterator j; HashMap hs; String hash; - for (int i = 0; i < sentences.length; i++) { + ArrayList sb = new ArrayList(); + String sentence; + while (sentences.hasMoreElements()) { + sentence = (String) sentences.nextElement(); //System.out.println("Sentence " + i + ": " + sentences[i]); - if (sentences[i].length() > minLength) { - hs = hashSentence(sentences[i]); + if (sentence.length() > minLength) { + hs = hashSentence(sentence); j = queryhashes.iterator(); while (j.hasNext()) { hash = (String) j.next(); if (hs.containsKey(hash)) { //System.out.println("hash " + hash + " appears in line " + i); - hitTable.incScore(new Integer(i)); + hitTable.incScore(new Integer(sb.size())); } } + sb.add(sentence); } } int score = hitTable.getMaxScore(); // best number of hits @@ -385,15 +390,14 @@ public class plasmaSnippetCache { // now find the shortest line of these hits int shortLineIndex = -1; int shortLineLength = Integer.MAX_VALUE; - for (int i = 0; i < sentences.length; i++) { - if ((hitTable.getScore(new Integer(i)) == score) && - (sentences[i].length() < shortLineLength)) { + for (int i = 0; i < sb.size(); i++) { + if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) { shortLineIndex = i; - shortLineLength = sentences[i].length(); + shortLineLength = ((String) sb.get(i)).length(); } } // find a first result - String result = sentences[shortLineIndex]; + String result = (String) sb.get(shortLineIndex); // remove all hashes that appear in the result hs = hashSentence(result); j = queryhashes.iterator();