fix for snippet-generation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3060 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 559f41a001
commit 937ccd4e76

@ -48,7 +48,6 @@
import java.io.File;
import java.io.Writer;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@ -132,9 +131,9 @@ public class CacheAdmin_p {
.append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
.append("<b>TEXT:</b><br><span class=\"small\">").append(new String(scraper.getText())).append("</span><br>")
.append("<b>LINES:</b><br><span class=\"small\">");
final Enumeration sentences = document.getSentences(false);
if (sentences != null) while (sentences.hasMoreElements()) {
info.append((String) sentences.nextElement()).append("<br>");
final Iterator sentences = document.getSentences(false);
if (sentences != null) while (sentences.hasNext()) {
info.append((String) sentences.next()).append("<br>");
}
info.append("</span><br>");
if (document != null) document.close();

@ -50,7 +50,6 @@ import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@ -312,7 +311,7 @@ public class ViewFile {
prop.put("viewMode_parsedText", content);
} else if (viewMode.equals("sentences")) {
prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
final Enumeration sentences = document.getSentences(pre);
final Iterator sentences = document.getSentences(pre);
boolean dark = true;
int i = 0;
@ -320,9 +319,9 @@ public class ViewFile {
String[] wordArray = wordArray(post.get("words", null));
// Search word highlighting
while (sentences.hasMoreElements()) {
while (sentences.hasNext()) {
prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, (String) sentences.nextElement()));
prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, (String) sentences.next()));
prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 1 : 0));
dark = !dark;
i++;

@ -14,6 +14,7 @@ import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
public class snippet {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException {
@ -53,6 +54,7 @@ public class snippet {
} else {
String error = snippet.getError();
if ((remove) && (error.equals("no matching snippet found"))) {
serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform());
switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url));
}
prop.put("text", error);

@ -28,9 +28,11 @@ package de.anomic.index;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
@ -49,7 +51,7 @@ public final class indexRAMRI implements indexRI {
// class variables
private final File databaseRoot;
protected final TreeMap cache; // wordhash-container
protected final SortedMap cache; // wordhash-container
private final kelondroMScoreCluster hashScore;
private final kelondroMScoreCluster hashDate;
private long initTime;
@ -72,7 +74,7 @@ public final class indexRAMRI implements indexRI {
// creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed
this.databaseRoot = databaseRoot;
this.cache = new TreeMap();
this.cache = Collections.synchronizedSortedMap(new TreeMap());
this.hashScore = new kelondroMScoreCluster();
this.hashDate = new kelondroMScoreCluster();
this.initTime = System.currentTimeMillis();

@ -740,8 +740,8 @@ public final class plasmaCondenser {
StringBuffer sb;
char c;
while (s.length() == 0) {
if (e.hasMoreElements()) {
r = (String) e.nextElement();
if (e.hasNext()) {
r = (String) e.next();
if (r == null) return null;
r = r.trim();
sb = new StringBuffer(r.length() * 2);
@ -788,7 +788,7 @@ public final class plasmaCondenser {
}
}
public static class sentencesFromInputStreamEnum implements Enumeration {
public static class sentencesFromInputStreamEnum implements Iterator {
// read sentences from a given input stream
// this enumerates String objects
@ -826,11 +826,11 @@ public final class plasmaCondenser {
}
}
public boolean hasMoreElements() {
public boolean hasNext() {
return buffer != null;
}
public Object nextElement() {
public Object next() {
if (buffer == null) {
return null;
} else {
@ -844,6 +844,10 @@ public final class plasmaCondenser {
// Returns the current value of the counter field.
// NOTE(review): presumably the number of sentences handed out so far; the
// code that updates counter is not visible in this hunk -- confirm.
public int count() {
return counter;
}
// Iterator.remove() is not supported by this sentence iterator:
// every call throws UnsupportedOperationException.
public void remove() {
throw new UnsupportedOperationException();
}
}
static String readSentence(Reader reader, boolean pre) throws IOException {

@ -397,6 +397,7 @@ public final class plasmaCrawlStacker {
indexURLEntry oldEntry = null;
oldEntry = this.sb.wordIndex.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
// FIXME: this does not work correctly?
if ((dbocc != null) && (!(recrawl))) {
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
//this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");

@ -56,7 +56,6 @@ import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@ -955,11 +954,11 @@ public final class plasmaParser {
System.out.println(document.getMainLongTitle());
// found text
final Enumeration sentences = document.getSentences(false);
final Iterator sentences = document.getSentences(false);
int i = 0;
if (sentences != null) while (sentences.hasMoreElements()) {
if (sentences != null) while (sentences.hasNext()) {
System.out.print("line " + i + ": ");
System.out.println((String) sentences.nextElement());
System.out.println((String) sentences.next());
i++;
}

@ -50,7 +50,6 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverFileUtils;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@ -194,7 +193,7 @@ public class plasmaParserDocument {
return -1;
}
public Enumeration getSentences(boolean pre) {
public Iterator getSentences(boolean pre) {
if (this.text == null) return null;
plasmaCondenser.sentencesFromInputStreamEnum e = plasmaCondenser.sentencesFromInputStream(getText(), this.charset);
e.pre(pre);

@ -47,13 +47,13 @@ package de.anomic.plasma;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
@ -264,7 +264,7 @@ public class plasmaSnippetCache {
// we have found a parseable non-empty file: use the lines
// compute snippet from text
final Enumeration sentences = document.getSentences(pre);
final Iterator sentences = document.getSentences(pre);
if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
@ -282,7 +282,7 @@ public class plasmaSnippetCache {
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
if (line == null) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
if ((line == null) || (line.length() < 3 /*snippetMinLength*/)) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache
@ -403,58 +403,64 @@ public class plasmaSnippetCache {
return result.substring(6);
}
private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
private String computeTextSnippet(Iterator sentences, Set queryhashes, int minLength, int maxLength) {
try {
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
HashMap hs;
String hash;
ArrayList sb = new ArrayList();
String sentence;
while (sentences.hasMoreElements()) {
sentence = (String) sentences.nextElement();
TreeMap os = new TreeMap();
int uniqCounter = 9999;
int score;
while (sentences.hasNext()) {
sentence = (String) sentences.next();
//System.out.println("Snippet-Sentence :" + sentence); // DEBUG
if (sentence.length() > minLength) {
hs = hashSentence(sentence);
j = queryhashes.iterator();
score = 0;
while (j.hasNext()) {
hash = (String) j.next();
if (hs.containsKey(hash)) {
//System.out.println("hash " + hash + " appears in line " + i);
hitTable.incScore(new Integer(sb.size()));
score++;
}
}
sb.add(sentence);
if (score > 0) {
os.put(new Integer(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
}
}
}
int score = hitTable.getMaxScore(); // best number of hits
if (score <= 0) return null;
// we found (a) line(s) that have <score> hits.
// now find the shortest line of these hits
int shortLineIndex = -1;
int shortLineLength = Integer.MAX_VALUE;
for (int i = 0; i < sb.size(); i++) {
if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) {
shortLineIndex = i;
shortLineLength = ((String) sb.get(i)).length();
String result;
Set remaininghashes;
while (os.size() > 0) {
sentence = (String) os.remove((Integer) os.lastKey()); // sentence with the biggest score
result = computeTextSnippet(sentence, queryhashes, minLength, maxLength);
if ((result != null) && (result.length() > 0)) {
remaininghashes = removeAppearanceHashes(result, queryhashes);
if (remaininghashes.size() == 0) {
// we have found the snippet
return result;
} else if (remaininghashes.size() < queryhashes.size()) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
String nextSnippet = computeTextSnippet(os.values().iterator(), remaininghashes, minLength / 2, maxLength);
if ((nextSnippet == null) || (nextSnippet.length() < (minLength / 2))) return null; // no success
return result + (" / " + nextSnippet);
} else {
// error
//assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
continue;
}
}
}
// find a first result
String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength);
Set remaininghashes = removeAppearanceHashes(result, queryhashes);
if (remaininghashes.size() == 0) return result;
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength);
if (nextSnippet == null) return null;
return result + (" / " + nextSnippet);
return null;
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
return "";

Loading…
Cancel
Save