removed lowercasing of snippets (and other things):

- added new sentence parser to condenser
- sentence parsing can now handle charsets

to do: charsets must be handed over to the new sentence parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2712 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 43614f1b36
commit 1969522dc1
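
A minimal usage sketch of the enumeration-based sentence API this commit introduces (illustrative only: the demo class, sample text and UTF-8 choice are invented; it assumes de.anomic.plasma.plasmaCondenser from this revision is on the classpath):

import java.io.ByteArrayInputStream;
import java.util.Enumeration;
import de.anomic.plasma.plasmaCondenser;

public class SentenceEnumDemo {
    public static void main(String[] args) throws Exception {
        byte[] text = "First line. Second line! Third?".getBytes("UTF-8");
        // an explicit charset exercises the new parameter; passing null falls
        // back to the platform default, which is the open "to do" above
        Enumeration sentences = plasmaCondenser.sentencesFromInputStream(
                new ByteArrayInputStream(text), "UTF-8");
        // the factory returns null if the charset name is not supported
        if (sentences != null) while (sentences.hasMoreElements()) {
            System.out.println((String) sentences.nextElement());
        }
    }
}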

@@ -48,6 +48,7 @@
import java.io.File;
import java.io.Writer;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@@ -128,9 +129,9 @@ public class CacheAdmin_p {
.append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
.append("<b>TEXT:</b><br><span class=\"small\">").append(new String(scraper.getText())).append("</span><br>")
.append("<b>LINES:</b><br><span class=\"small\">");
final String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) {
info.append(sentences[i]).append("<br>");
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
if (sentences != null) while (sentences.hasMoreElements()) {
info.append((String) sentences.nextElement()).append("<br>");
}
info.append("</span><br>");
if (document != null) document.close();

@@ -49,6 +49,7 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Enumeration;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
@@ -262,11 +263,12 @@ public class ViewFile {
prop.put("viewMode_parsedText",content);
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
String[] sentences = document.getSentences();
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
boolean dark = true;
for (int i=0; i < sentences.length; i++) {
String currentSentence = wikiCode.replaceHTML(sentences[i]);
int i = 0;
if (sentences != null) while (sentences.hasMoreElements()) {
String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
// Search word highlighting
String words = post.get("words",null);
@@ -286,8 +288,9 @@
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
i++;
}
prop.put("viewMode_sentences",sentences.length);
prop.put("viewMode_sentences", i);
}
if (document != null) document.close();

@@ -51,6 +51,8 @@ import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
@@ -469,7 +471,7 @@ public final class plasmaCondenser {
}
protected final static boolean punctuation(char c) {
return ("!?.".indexOf(c) >= 0);
return (c == '.') || (c == '!') || (c == '?');
}
public final static boolean invisible(char c) {
@@ -648,7 +650,89 @@ public final class plasmaCondenser {
return counter;
}
}
public static Enumeration sentencesFromInputStream(InputStream is, String charset) {
try {
return new sentencesFromInputStreamEnum(is, charset);
} catch (UnsupportedEncodingException e) {
return null;
}
}
private static class sentencesFromInputStreamEnum implements Enumeration {
// read sentences from a given input stream
// this enumerates String objects
Object buffer = null;
BufferedReader raf;
int counter = 0;
public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset));
buffer = nextElement0();
counter = 0;
}
private Object nextElement0() {
try {
String s = readSentence(raf);
if (s == null) {
raf.close();
return null;
}
return s;
} catch (IOException e) {
try {
raf.close();
} catch (Exception ee) {
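// the reader is abandoned after the read error anyway, so a failure
// to close it is deliberately ignored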
}
return null;
}
}
public boolean hasMoreElements() {
return buffer != null;
}
public Object nextElement() {
if (buffer == null) {
return null;
} else {
counter = counter + ((String) buffer).length() + 1;
Object r = buffer;
buffer = nextElement0();
return r;
}
}
public int count() {
return counter;
}
}
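// Illustrative comment, not part of the original code: the class keeps a
// one-sentence look-ahead in `buffer`, so hasMoreElements() can answer
// without consuming more input, and counter advances by sentence length
// plus one for every element handed out. For example:
//
//   Enumeration e = sentencesFromInputStream(
//       new ByteArrayInputStream("One. Two!".getBytes("UTF-8")), "UTF-8");
//   e.nextElement();   // "One."
//   e.nextElement();   // " Two!" (leading blank; see readSentence below)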
static String readSentence(Reader reader) throws IOException {
StringBuffer s = new StringBuffer();
int nextChar;
char c;
// find sentence end
for (;;) {
nextChar = reader.read();
if (nextChar < 0) return null;
c = (char) nextChar;
s.append(c);
if (punctuation(c)) break;
}
// replace line endings and tabs by blanks
for (int i = 0; i < s.length(); i++) {
if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 9)) s.setCharAt(i, ' ');
}
// remove all double-spaces
int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
return new String(s);
}
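// Worked example (illustrative): calling readSentence from within this
// package on new StringReader("a  b\nc. rest") reads up to and including
// the first '.', '!' or '?', turns the newline into a blank and collapses
// the double space, returning "a b c."; the remainder " rest" stays in the
// reader, and a following call returns null because no further punctuation
// appears before end of input.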
/*
private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) {
// we store lines containing a key in search vector

@@ -55,6 +55,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@@ -815,12 +816,12 @@ public final class plasmaParser {
System.out.println(document.getMainLongTitle());
// found text
String[] sentences = document.getSentences();
if (sentences != null) {
for (int i = 0; i < sentences.length; i++) {
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
int i = 0;
if (sentences != null) while (sentences.hasMoreElements()) {
System.out.print("line " + i + ": ");
System.out.println(sentences[i]);
}
System.out.println((String) sentences.nextElement());
i++;
}
// found links

@@ -50,6 +50,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverFileUtils;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -189,13 +190,9 @@ public class plasmaParserDocument {
return -1;
}
public plasmaCondenser getCondenser() {
if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0);
return condenser;
}
public String[] getSentences() {
return getCondenser().sentences();
public Enumeration getSentences(String charset) {
if (this.text == null) return null;
return plasmaCondenser.sentencesFromInputStream(getText(), charset);
}
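// Hypothetical resolution of the "FIXME: apply correct charset" at the
// call sites above (the accessor getCharset() is assumed and does not
// exist in this revision): pass the charset detected for the document
// instead of null, e.g.
//
//   Enumeration sentences = document.getSentences(document.getCharset());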
public String getKeywords(char separator) {

@@ -47,6 +47,7 @@ package de.anomic.plasma;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
@@ -249,10 +250,10 @@ public class plasmaSnippetCache {
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url);
String[] sentences = document.getSentences();
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
document.close();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
if (sentences == null) {
//System.out.println("found no sentences in url " + url);
return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
}
@@ -357,26 +358,30 @@ public class plasmaSnippetCache {
return (String) snippetsCache.get(key);
}
private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
try {
if ((sentences == null) || (sentences.length == 0)) return null;
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
HashMap hs;
String hash;
for (int i = 0; i < sentences.length; i++) {
ArrayList sb = new ArrayList();
String sentence;
while (sentences.hasMoreElements()) {
sentence = (String) sentences.nextElement();
//System.out.println("Sentence " + i + ": " + sentences[i]);
if (sentences[i].length() > minLength) {
hs = hashSentence(sentences[i]);
if (sentence.length() > minLength) {
hs = hashSentence(sentence);
j = queryhashes.iterator();
while (j.hasNext()) {
hash = (String) j.next();
if (hs.containsKey(hash)) {
//System.out.println("hash " + hash + " appears in line " + i);
hitTable.incScore(new Integer(i));
hitTable.incScore(new Integer(sb.size()));
}
}
sb.add(sentence);
}
}
int score = hitTable.getMaxScore(); // best number of hits
@@ -385,15 +390,14 @@
// now find the shortest line of these hits
int shortLineIndex = -1;
int shortLineLength = Integer.MAX_VALUE;
for (int i = 0; i < sentences.length; i++) {
if ((hitTable.getScore(new Integer(i)) == score) &&
(sentences[i].length() < shortLineLength)) {
for (int i = 0; i < sb.size(); i++) {
if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) {
shortLineIndex = i;
shortLineLength = sentences[i].length();
shortLineLength = ((String) sb.get(i)).length();
}
}
// find a first result
String result = sentences[shortLineIndex];
String result = (String) sb.get(shortLineIndex);
// remove all hashes that appear in the result
hs = hashSentence(result);
j = queryhashes.iterator();
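
Because the sentence Enumeration can be traversed only once, computeSnippet now copies every sufficiently long sentence into the ArrayList sb during the scoring pass; the list index doubles as the key in hitTable, so the second pass can still pair scores with sentences and pick the shortest top-scoring line. The buffering step in isolation (a generic sketch in the pre-generics style of this code base, not YaCy code):

import java.util.ArrayList;
import java.util.Enumeration;

public class EnumBufferSketch {
    // drain a single-pass Enumeration into a random-access list so that a
    // second, index-based pass over the elements becomes possible
    static ArrayList toList(Enumeration e) {
        ArrayList list = new ArrayList();
        while (e.hasMoreElements()) list.add(e.nextElement());
        return list;
    }
}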
