simplified snippet computation process and separated the algorithm into two classes

also enhanced selection criteria for best snippet line computation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7182 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 4450c240b7
commit 10a9cb1971

@ -29,6 +29,7 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
@ -243,7 +244,7 @@ public class ViewFile {
} else if (viewMode.equals("sentences")) {
prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
final Iterator<StringBuilder> sentences = document.getSentences(pre);
final Collection<StringBuilder> sentences = document.getSentences(pre);
boolean dark = true;
int i = 0;
@ -251,8 +252,8 @@ public class ViewFile {
if (sentences != null) {
// Search word highlighting
while (sentences.hasNext()) {
sentence = sentences.next().toString();
for (StringBuilder s: sentences) {
sentence = s.toString();
if (sentence.trim().length() > 0) {
prop.put("viewMode_sentences_" + i + "_nr", i + 1);
prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence));
@ -266,7 +267,7 @@ public class ViewFile {
} else if (viewMode.equals("words")) {
prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS);
final Iterator<StringBuilder> sentences = document.getSentences(pre);
final Collection<StringBuilder> sentences = document.getSentences(pre);
boolean dark = true;
int i = 0;
@ -274,8 +275,8 @@ public class ViewFile {
if (sentences != null) {
// Search word highlighting
while (sentences.hasNext()) {
sentence = sentences.next().toString();
for (StringBuilder s: sentences) {
sentence = s.toString();
Enumeration<StringBuilder> tokens = Condenser.wordTokenizer(sentence, "UTF-8");
while (tokens.hasMoreElements()) {
token = tokens.nextElement().toString();

@ -80,6 +80,8 @@ public class yacysearch {
final boolean searchAllowed = sb.getConfigBool("publicSearchpage", true) || sb.verifyAuthentication(header, false);
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
final boolean localhostAccess = sb.accessFromLocalhost(header);
int display = (post == null) ? 0 : post.getInt("display", 0);
if (!authenticated) display = 2;
// display == 0: shop top menu
@ -234,34 +236,34 @@ public class yacysearch {
global = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
block = true;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search");
Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search");
} else if (Domains.matchesList(client, sb.networkWhitelist)) {
Log.logInfo("LOCAL_SEARCH", "ACCECC CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions");
} else if (!authenticated && (global || snippetFetchStrategy.isAllowedToFetchOnline())) {
Log.logInfo("LOCAL_SEARCH", "ACCESS CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions");
} else if (!authenticated && !localhostAccess) {
// in case that we do a global search or we want to fetch snippets, we check for DoS cases
synchronized (trackerHandles) {
int accInOneSecond = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 1000)).size();
int accInThreeSeconds = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size();
int accInOneMinute = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size();
int accInTenMinutes = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size();
if (accInTenMinutes > 600) {
global = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
block = true;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInTenMinutes + " searches in ten minutes, fully blocked (no results generated)");
} else if (accInOneMinute > 200) {
global = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
// protections against too strong YaCy network load, reduces remote search
if (global) {
if (accInTenMinutes >= 30 || accInOneMinute >= 6 || accInThreeSeconds >= 1) {
global = false;
Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + "/1s, " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed global search");
}
}
// protection against too many remote server snippet loads (protects traffic on server)
if (snippetFetchStrategy.isAllowedToFetchOnline()) {
if (accInTenMinutes >= 20 || accInOneMinute >= 4 || accInThreeSeconds >= 1) {
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + "/1s, " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed remote snippet loading");
}
}
// general load protection
if (accInTenMinutes >= 2000 || accInOneMinute >= 600 || accInOneSecond >= 20) {
block = true;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneMinute + " searches in one minute, fully blocked (no results generated)");
} else if (accInThreeSeconds > 1) {
global = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + " searches in three seconds, blocked global search and snippets");
} else if (accInOneSecond > 2) {
global = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + " searches in one second, blocked global search and snippets");
Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + "/1s, " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed search");
}
}
}

@ -256,10 +256,9 @@ public final class HTTPDemon implements serverHandler, Cloneable {
public static int staticAdminAuthenticated(final String authorization, final serverSwitch sw) {
// the authorization string must be given with the truncated 6 bytes at the beginning
if (authorization == null) return 1;
//if (authorization.length() < 6) return 1; // no authentication information given
final String adminAccountBase64MD5 = sw.getConfig(ADMIN_ACCOUNT_B64MD5, "");
if (adminAccountBase64MD5.length() == 0) return 2; // no password stored
if (authorization == null || authorization.length() == 0) return 1;
if (adminAccountBase64MD5.equals(Digest.encodeMD5Hex(authorization))) return 4; // hard-authenticated, all ok
return 1;
}

@ -29,17 +29,20 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
@ -157,8 +160,8 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
entry = i.next();
url = new DigestURI(entry.getKey());
desc = entry.getValue();
int ranking = TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
TextSnippet.removeAppearanceHashes(desc, queryhashes).size();
int ranking = removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
removeAppearanceHashes(desc, queryhashes).size();
if (ranking < 2 * queryhashes.size()) {
result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, source));
}
@ -186,41 +189,40 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
if (ientry.width() > 0 && ientry.width() < 64) continue;
desc = ientry.alt();
int appcount = queryhashes.size() * 2 -
TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() -
TextSnippet.removeAppearanceHashes(desc, queryhashes).size();
removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() -
removeAppearanceHashes(desc, queryhashes).size();
final long ranking = Long.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source));
}
return result;
}
/*
private static String computeMediaSnippet(Map<yacyURL, String> media, Set<String> queryhashes) {
Iterator<Map.Entry<yacyURL, String>> i = media.entrySet().iterator();
Map.Entry<yacyURL, String> entry;
yacyURL url;
String desc;
Set<String> s;
String result = "";
while (i.hasNext()) {
entry = i.next();
url = entry.getKey();
desc = entry.getValue();
s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes);
if (isEmpty()) {
result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
continue;
}
s = removeAppearanceHashes(desc, s);
if (isEmpty()) {
result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
continue;
/**
* Removes all word hashes that can be computed as tokens from a given sentence from a given hash set
* @param sentence
* @param queryhashes
* @return the given hash set minus the hashes from the tokenization of the given sentence
*/
private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
// remove all hashes that appear in the sentence
if (sentence == null) return queryhashes;
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
final Iterator<byte[]> j = queryhashes.iterator();
byte[] hash;
Integer pos;
final HandleSet remaininghashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), queryhashes.size());
while (j.hasNext()) {
hash = j.next();
pos = hs.get(hash);
if (pos == null) {
try {
remaininghashes.put(hash);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
}
}
if (result.length() == 0) return null;
return result.substring(6);
return remaininghashes;
}
*/
}

@ -220,7 +220,7 @@ public class ResultFetcher {
if (query.contentdom == ContentDomain.TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
final TextSnippet snippet = TextSnippet.retrieveTextSnippet(
final TextSnippet snippet = new TextSnippet(
this.loader,
metadata,
snippetFetchWordHashes,

@ -2057,12 +2057,19 @@ public final class Switchboard extends serverSwitch {
}
}
public int adminAuthenticated(final RequestHeader requestHeader) {
public boolean accessFromLocalhost(final RequestHeader requestHeader) {
// authorization for localhost, only if flag is set to grant localhost access as admin
final String clientIP = requestHeader.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "");
if (!Domains.isLocal(clientIP)) return false;
final String refererHost = requestHeader.refererHost();
boolean accessFromLocalhost = Domains.isLocal(clientIP) && (refererHost == null || refererHost.length() == 0 || Domains.isLocal(refererHost));
return refererHost == null || refererHost.length() == 0 || Domains.isLocal(refererHost);
}
public int adminAuthenticated(final RequestHeader requestHeader) {
// authorization for localhost, only if flag is set to grant localhost access as admin
boolean accessFromLocalhost = accessFromLocalhost(requestHeader);
if (getConfigBool("adminAccountForLocalhost", false) && accessFromLocalhost) return 3; // soft-authenticated for localhost
// get the authorization string from the header

@ -25,6 +25,7 @@
package de.anomic.search;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.TreeMap;
@ -36,20 +37,18 @@ import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.SnippetExtractor;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Cache;
import de.anomic.yacy.yacySearch;
public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
@ -68,66 +67,6 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public static final int ERROR_PARSER_NO_LINES = 15;
public static final int ERROR_NO_MATCH = 16;
private static final ARC<String, String> snippetsCache = new ConcurrentARC<String, String>(maxCache, Math.max(10, Runtime.getRuntime().availableProcessors()));
private static final ARC<String, DigestURI> faviconCache = new ConcurrentARC<String, DigestURI>(maxCache, Math.max(10, Runtime.getRuntime().availableProcessors()));
private final DigestURI url;
private String line;
private final String error;
private final int errorCode;
private HandleSet remaingHashes;
private final DigestURI favicon;
public static boolean existsInCache(final DigestURI url, final HandleSet queryhashes) {
final String hashes = yacySearch.set2string(queryhashes);
return retrieveFromCache(hashes, new String(url.hash())) != null;
}
public static void storeToCache(final String wordhashes, final String urlhash, final String snippet) {
// generate key
String key = urlhash + wordhashes;
// do nothing if snippet is known
if (snippetsCache.containsKey(key)) return;
// learn new snippet
snippetsCache.put(key, snippet);
}
public static String retrieveFromCache(final String wordhashes, final String urlhash) {
// generate key
final String key = urlhash + wordhashes;
return snippetsCache.get(key);
}
/**
* Removes all word hashes that can be computed as tokens from a given sentence from a given hash set
* @param sentence
* @param queryhashes
* @return the given hash set minus the hashes from the tokenization of the given sentence
*/
public static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
// remove all hashes that appear in the sentence
if (sentence == null) return queryhashes;
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
final Iterator<byte[]> j = queryhashes.iterator();
byte[] hash;
Integer pos;
final HandleSet remaininghashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), queryhashes.size());
while (j.hasNext()) {
hash = j.next();
pos = hs.get(hash);
if (pos == null) {
try {
remaininghashes.put(hash);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
}
}
return remaininghashes;
}
/**
* <code>\\A[^\\p{L}\\p{N}].+</code>
*/
@ -149,53 +88,205 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
*/
private final static Pattern p01 = Pattern.compile("(.*?)(\\<b\\>.+?\\</b\\>)(.*)"); // marked words are in <b>-tags
public TextSnippet(final DigestURI url, final String line, final int errorCode, final HandleSet remaingHashes, final String errortext) {
this(url, line, errorCode, remaingHashes, errortext, null);
/**
 * In-memory snippet cache keyed by the concatenation urlhash + wordhashes.
 * Backed by a ConcurrentARC sized by maxCache; thread-safety is that of the
 * underlying ARC implementation.
 */
public static class Cache {
// ARC = adaptive replacement cache; maps (urlhash + wordhashes) -> snippet line
private final ARC<String, String> cache;
public Cache() {
cache = new ConcurrentARC<String, String>(maxCache, Math.max(10, Runtime.getRuntime().availableProcessors()));
}
/**
 * Stores a snippet line for a word-hash/url-hash combination.
 * Does nothing if an entry for this key already exists (first write wins).
 * @param wordhashes concatenated query word hashes
 * @param urlhash hash of the document url
 * @param snippet the snippet line to remember
 */
public void put(final String wordhashes, final String urlhash, final String snippet) {
// generate key
String key = urlhash + wordhashes;
// do nothing if snippet is known
if (cache.containsKey(key)) return;
// learn new snippet
cache.put(key, snippet);
}
/**
 * Looks up a previously stored snippet.
 * @return the cached snippet line, or null if none is stored for this key
 */
public String get(final String wordhashes, final String urlhash) {
// generate key
final String key = urlhash + wordhashes;
return cache.get(key);
}
/**
 * @return true if a snippet is cached for this word-hash/url-hash combination
 */
public boolean contains(final String wordhashes, final String urlhash) {
return cache.containsKey(urlhash + wordhashes);
}
}
public static final Cache snippetsCache = new Cache();
private byte[] urlhash;
private String line;
private String error;
private int errorCode;
public TextSnippet(final byte[] urlhash, final String line, final int errorCode, final String errortext) {
init(urlhash, line, errorCode, errortext);
}
public TextSnippet(final DigestURI url, final String line, final int errorCode, final HandleSet remaingHashes, final String errortext, final DigestURI favicon) {
this.url = url;
/**
 * Computes a text snippet for a search result document.
 * Resolution order, as visible in this body: (1) the shared snippetsCache,
 * (2) document metadata (title, creator, subject, normalized url) when the
 * resource is not already in the htcache and the cache strategy allows it,
 * (3) loading the resource via the given loader, parsing it, and extracting
 * a snippet with SnippetExtractor. The result (snippet line or error code)
 * is stored on this instance through init(...).
 *
 * @param loader dispatcher used to fetch the resource when no cached snippet exists
 * @param comp metadata components (url, dc_title, dc_creator, dc_subject) of the candidate document
 * @param queryhashes hashes of the query words the snippet must contain
 * @param cacheStrategy controls whether network loading is permitted (mustBeOffline)
 * @param pre passed through to document.getSentences — presumably a pre-formatting flag; confirm in Document
 * @param snippetMaxLength maximum length of the produced snippet line
 * @param maxDocLen NOTE(review): not referenced anywhere in this body — confirm whether it is still needed
 * @param reindexing passed to loader.request for the load request
 */
public TextSnippet(final LoaderDispatcher loader, final URIMetadataRow.Components comp, final HandleSet queryhashes, final CrawlProfile.CacheStrategy cacheStrategy, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) {
// heise = "0OQUNU3JSs05"
final DigestURI url = comp.url();
if (queryhashes.isEmpty()) {
//System.out.println("found no queryhashes for URL retrieve " + url);
init(url.hash(), null, ERROR_NO_HASH_GIVEN, "no query hashes given");
return;
}
// try to get snippet from snippetCache
int source = SOURCE_CACHE;
final String wordhashes = yacySearch.set2string(queryhashes);
final String urls = new String(url.hash());
String line = snippetsCache.get(wordhashes, urls);
if (line != null) {
// found the snippet
init(url.hash(), line, source, null);
return;
}
/* ===========================================================================
* LOAD RESOURCE DATA
* =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
Response response;
try {
// first try to get the snippet from metadata
String loc;
// metadata shortcut only applies when the object is NOT already cached locally
// and the strategy does not force offline operation
boolean objectWasInCache = de.anomic.http.client.Cache.has(url);
boolean useMetadata = !objectWasInCache && !cacheStrategy.mustBeOffline();
if (useMetadata && containsAllHashes(loc = comp.dc_title(), queryhashes)) {
// try to create the snippet from information given in the url itself
init(url.hash(), loc, SOURCE_METADATA, null);
return;
} else if (useMetadata && containsAllHashes(loc = comp.dc_creator(), queryhashes)) {
// try to create the snippet from information given in the creator metadata
init(url.hash(), loc, SOURCE_METADATA, null);
return;
} else if (useMetadata && containsAllHashes(loc = comp.dc_subject(), queryhashes)) {
// try to create the snippet from information given in the subject metadata
init(url.hash(), loc, SOURCE_METADATA, null);
return;
} else if (useMetadata && containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
// try to create the snippet from information given in the url
init(url.hash(), loc, SOURCE_METADATA, null);
return;
} else {
// try to load the resource from the cache
response = loader.load(loader.request(url, true, reindexing), cacheStrategy, Long.MAX_VALUE);
if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy.mustBeOffline()) {
init(url.hash(), null, ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
return;
}
// if it is still not available, report an error
init(url.hash(), null, ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
return;
}
if (!objectWasInCache) {
// place entry on indexing queue
Switchboard.getSwitchboard().toIndexer(response);
source = SOURCE_WEB;
}
}
} catch (final Exception e) {
//Log.logException(e);
init(url.hash(), null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
return;
}
/* ===========================================================================
* PARSE RESOURCE
* =========================================================================== */
Document document = null;
try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
} catch (final Parser.Failure e) {
init(url.hash(), null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
return;
}
if (document == null) {
init(url.hash(), null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
return;
}
/* ===========================================================================
* COMPUTE SNIPPET
* =========================================================================== */
// we have found a parseable non-empty file: use the lines
// compute snippet from text
final Collection<StringBuilder> sentences = document.getSentences(pre);
if (sentences == null) {
init(url.hash(), null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
return;
}
final SnippetExtractor tsr;
String textline = null;
HandleSet remainingHashes = queryhashes;
try {
// SnippetExtractor signals "no matching sentence" via UnsupportedOperationException
tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength);
textline = tsr.getSnippet();
remainingHashes = tsr.getRemainingWords();
} catch (UnsupportedOperationException e) {
init(url.hash(), null, ERROR_NO_MATCH, "no matching snippet found");
return;
}
// compute snippet from media
//String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
//String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
//String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
//String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
//String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
line = "";
//if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
//if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
//if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
// NOTE(review): line cannot be null here (initialized to "" above); the effective
// failure condition is !remainingHashes.isEmpty() — query words left unmatched
if (line == null || !remainingHashes.isEmpty()) {
init(url.hash(), null, ERROR_NO_MATCH, "no matching snippet found");
return;
}
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache
snippetsCache.put(wordhashes, urls, line);
document.close();
init(url.hash(), line, source, null);
}
private void init(final byte[] urlhash, final String line, final int errorCode, final String errortext) {
this.urlhash = urlhash;
this.line = line;
this.errorCode = errorCode;
this.error = errortext;
this.remaingHashes = remaingHashes;
this.favicon = favicon;
}
public DigestURI getUrl() {
return this.url;
}
public DigestURI getFavicon() {
return this.favicon;
}
public boolean exists() {
return line != null;
}
public int compareTo(TextSnippet o) {
return Base64Order.enhancedCoder.compare(this.url.hash(), o.url.hash());
}
public int compare(TextSnippet o1, TextSnippet o2) {
return o1.compareTo(o2);
}
public int hashCode() {
return ByteArray.hashCode(this.url.hash());
}
@Override
public String toString() {
return (line == null) ? "" : line;
}
public String getLineRaw() {
return (line == null) ? "" : line;
}
public String getError() {
return (error == null) ? "" : error.trim();
}
public int getErrorCode() {
return errorCode;
}
public HandleSet getRemainingHashes() {
return this.remaingHashes;
}
public String getLineMarked(final HandleSet queryHashes) {
if (line == null) return "";
if (queryHashes == null || queryHashes.isEmpty()) return line.trim();
@ -225,6 +316,23 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return l.toString().trim();
}
public int compareTo(TextSnippet o) {
return Base64Order.enhancedCoder.compare(this.urlhash, o.urlhash);
}
public int compare(TextSnippet o1, TextSnippet o2) {
return o1.compareTo(o2);
}
public int hashCode() {
return ByteArray.hashCode(this.urlhash);
}
@Override
public String toString() {
return (line == null) ? "" : line;
}
/**
* mark words with &lt;b&gt;-tags
* @param word the word to mark
@ -307,119 +415,6 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return al;
}
public static TextSnippet retrieveTextSnippet(final LoaderDispatcher loader, final URIMetadataRow.Components comp, final HandleSet queryhashes, final CrawlProfile.CacheStrategy cacheStrategy, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) {
// heise = "0OQUNU3JSs05"
final DigestURI url = comp.url();
if (queryhashes.isEmpty()) {
//System.out.println("found no queryhashes for URL retrieve " + url);
return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given");
}
// try to get snippet from snippetCache
int source = SOURCE_CACHE;
final String wordhashes = yacySearch.set2string(queryhashes);
final String urls = new String(url.hash());
String line = retrieveFromCache(wordhashes, urls);
if (line != null) {
// found the snippet
return new TextSnippet(url, line, source, null, null, faviconCache.get(urls));
}
/* ===========================================================================
* LOADING RESOURCE DATA
* =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
Response response;
try {
// first try to get the snippet from metadata
String loc;
if (containsAllHashes(loc = comp.dc_title(), queryhashes)) {
// try to create the snippet from information given in the url itself
return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(urls));
} else if (containsAllHashes(loc = comp.dc_creator(), queryhashes)) {
// try to create the snippet from information given in the creator metadata
return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(urls));
} else if (containsAllHashes(loc = comp.dc_subject(), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(urls));
} else if (containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(urls));
} else {
// trying to load the resource from the cache
boolean objectWasInCache = Cache.has(url);
response = loader.load(loader.request(url, true, reindexing), cacheStrategy, Long.MAX_VALUE);
if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy.mustBeOffline()) {
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "omitted network load (not allowed), no cache entry");
}
// if it is still not available, report an error
return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource from net, no cache entry");
}
if (!objectWasInCache) {
// place entry on indexing queue
Switchboard.getSwitchboard().toIndexer(response);
source = SOURCE_WEB;
}
}
} catch (final Exception e) {
//Log.logException(e);
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "error loading resource: " + e.getMessage());
}
/* ===========================================================================
* PARSING RESOURCE
* =========================================================================== */
Document document = null;
try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
} catch (final Parser.Failure e) {
return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
}
if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed
/* ===========================================================================
* COMPUTE SNIPPET
* =========================================================================== */
final DigestURI resFavicon = (document.getFavicon() == null) ? null : new DigestURI(document.getFavicon());
if (resFavicon != null) faviconCache.put(urls, resFavicon);
// we have found a parseable non-empty file: use the lines
// compute snippet from text
final Iterator<StringBuilder> sentences = document.getSentences(pre);
if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon);
final Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
final String textline = (tsr == null) ? null : (String) tsr[0];
final HandleSet remainingHashes = (tsr == null) ? queryhashes : (HandleSet) tsr[1];
// compute snippet from media
//String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
//String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
//String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
//String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
//String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
line = "";
//if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
//if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
//if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
if (line == null || !remainingHashes.isEmpty()) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found",resFavicon);
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache
storeToCache(wordhashes, urls, line);
document.close();
return new TextSnippet(url, line, source, null, null, resFavicon);
}
private static boolean containsAllHashes(final String sentence, final HandleSet queryhashes) {
final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence);
for (byte[] b: queryhashes) {
@ -428,136 +423,4 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return true;
}
/**
 * Compute a text snippet from a set of candidate sentences such that the snippet
 * contains as many of the given query word hashes as possible.
 * Sentences are scored (more matching words first, shorter sentences preferred),
 * the best one is trimmed to maxLength, and if query words are still missing,
 * further sentences are appended recursively.
 * @param sentences iterator over the candidate sentences
 * @param queryhashes hashes of the query words the snippet shall contain
 * @param maxLength maximum length of the resulting snippet string
 * @return {String - the snippet, HandleSet - remaining hashes} or null if no snippet was found
 */
private static Object[] /*{String - the snippet, HandleSet - remaining hashes}*/
computeTextSnippet(final Iterator<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) {
try {
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.isEmpty())) return null;
Iterator<byte[]> j;
TreeMap<byte[], Integer> hs;
StringBuilder sentence;
// sentences ordered by score; the entry with the largest key is the best candidate
final TreeMap<Integer, StringBuilder> os = new TreeMap<Integer, StringBuilder>();
// decreasing counter mixed into each key so equal scores still produce unique map keys
int uniqCounter = 9999;
int score;
while (sentences.hasNext()) {
sentence = sentences.next();
hs = Condenser.hashSentence(sentence.toString());
j = queryhashes.iterator();
score = 0;
// score = number of query hashes that occur in this sentence
while (j.hasNext()) {if (hs.containsKey(j.next())) score++;}
if (score > 0) {
// key prefers many matching words (1st order) and shorter sentences (2nd order)
os.put(Integer.valueOf(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
}
}
String result;
HandleSet remaininghashes;
while (!os.isEmpty()) {
sentence = os.remove(os.lastKey()); // sentence with the biggest score
// trim the candidate to maxLength and determine which query hashes it covers
Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength);
if (tsr == null) continue;
result = (String) tsr[0];
if ((result != null) && (result.length() > 0)) {
remaininghashes = (HandleSet) tsr[1];
if (remaininghashes.isEmpty()) {
// we have found the snippet
return new Object[]{result, remaininghashes};
} else if (remaininghashes.size() < queryhashes.size()) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength);
if (tsr == null) return null;
final String nextSnippet = (String) tsr[0];
if (nextSnippet == null) return tsr;
// concatenate the partial snippets with a " / " separator
return new Object[]{result + (" / " + nextSnippet), tsr[1]};
} else {
// error: this sentence did not reduce the set of missing words; try the next candidate
//assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
continue;
}
}
}
return null;
} catch (final IndexOutOfBoundsException e) {
// string-cutting errors are logged and reported as "nothing found, all hashes remain"
Log.logSevere("computeSnippet", "error with string generation", e);
return new Object[]{null, queryhashes};
}
}
/**
 * Trim a single sentence to a snippet of at most maxLength characters while
 * keeping the region in which the query words occur; query hashes that do not
 * occur in the sentence are returned as the remaining-hash set.
 * @param sentence the sentence to shorten
 * @param queryhashes hashes of the query words
 * @param maxLength maximum length of the returned snippet
 * @return {String - the snippet, HandleSet - hashes NOT found in the sentence} or null on error
 */
private static Object[] /*{String - the snippet, HandleSet - remaining hashes}*/
computeTextSnippet(String sentence, final HandleSet queryhashes, final int maxLength) {
try {
if (sentence == null) return null;
if ((queryhashes == null) || (queryhashes.isEmpty())) return null;
byte[] hash;
// find all hashes that appear in the sentence
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
final Iterator<byte[]> j = queryhashes.iterator();
Integer pos;
// minpos/maxpos bracket the first and last occurrence of any query word
int p, minpos = sentence.length(), maxpos = -1;
final HandleSet remainingHashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), 0);
while (j.hasNext()) {
hash = j.next();
pos = hs.get(hash);
if (pos == null) {
// query word not in this sentence: record it as still missing
try {
remainingHashes.put(hash);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
} else {
p = pos.intValue();
if (p > maxpos) maxpos = p;
if (p < minpos) minpos = p;
}
}
// check result size
maxpos = maxpos + 10;
if (maxpos > sentence.length()) maxpos = sentence.length();
if (minpos < 0) minpos = 0;
// we have a result, but is it short enough?
if (maxpos - minpos + 10 > maxLength) {
// the string is too long, even if we cut at both ends
// so cut here in the middle of the string
final int lenb = sentence.length();
sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
" [..] " +
sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
maxpos = maxpos + lenb - sentence.length() + 6;
}
if (maxpos > maxLength) {
// the string is too long, even if we cut it at the end
// so cut it here at both ends at once
assert maxpos >= minpos;
final int newlen = Math.max(10, maxpos - minpos + 10);
final int around = (maxLength - newlen) / 2;
// NOTE(review): minpos - around may be negative for small minpos; the resulting
// IndexOutOfBoundsException is caught below and reported as failure
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
//assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around;
maxpos = sentence.length() - around - 5;
}
if (sentence.length() > maxLength) {
// trim sentence, 1st step (cut at right side)
sentence = sentence.substring(0, maxpos).trim() + " [..]";
}
if (sentence.length() > maxLength) {
// trim sentence, 2nd step (cut at left side)
sentence = "[..] " + sentence.substring(minpos).trim();
}
if (sentence.length() > maxLength) {
// trim sentence, 3rd step (cut in the middle)
// NOTE(review): assumes the sentence is still at least 26 characters long here;
// shorter input throws IndexOutOfBoundsException, handled below — confirm intended
sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
}
return new Object[] {sentence, remainingHashes};
} catch (final IndexOutOfBoundsException e) {
Log.logSevere("computeSnippet", "error with string generation", e);
return null;
}
}
}

@ -480,7 +480,7 @@ public final class yacyClient {
// because they are search-specific.
// instead, they are placed in a snipped-search cache.
// System.out.println("--- RECEIVED SNIPPET '" + urlEntry.snippet() + "'");
TextSnippet.storeToCache(wordhashes, new String(urlEntry.hash()), urlEntry.snippet());
TextSnippet.snippetsCache.put(wordhashes, new String(urlEntry.hash()), urlEntry.snippet());
}
// add the url entry to the word indexes

@ -226,7 +226,7 @@ public class WeakPriorityBlockingQueue<E> {
}
/**
 * return the specific amount of entries as they would be retrievable with element()
 * if count is < 0 then all elements are taken
 * the returned list is not cloned from the internal list and shall not be modified in any way (read-only)
 * @param count the number of entries to return; a negative value returns all elements
 * @return the internal list of drained entries (read-only, do not modify)
 */
public synchronized ArrayList<E> list(final int count) {
    if (count < 0) {
        // a negative count means: drain and return everything
        return list();
    }
    // fixed typo in message: "avaiable" -> "available"
    if (count > sizeAvailable()) throw new RuntimeException("list(" + count + ") exceeded available number of elements (" + sizeAvailable() + ")");
    // drain elements from the queue until enough entries are recorded
    while (count > this.drained.size()) this.poll();
    return this.drained;
}
/**
 * return all entries as they would be retrievable with element()
 * note: the returned list is the internal drained list, not a copy
 * @return a list of all elements in the stack
 */
public synchronized ArrayList<E> list() {
// shift all elements: drain the whole queue so every element is recorded in this.drained
while (!this.queue.isEmpty()) this.poll();
return this.drained;
}
/**
* iterate over all elements available. All elements that are still in the queue are drained to recorded positions
* @return an iterator over all drained positions.

@ -33,6 +33,7 @@ import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
@ -275,11 +276,15 @@ dc_rights
return -1;
}
public Iterator<StringBuilder> getSentences(final boolean pre) {
public List<StringBuilder> getSentences(final boolean pre) {
if (this.text == null) return null;
final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText());
e.pre(pre);
return e;
ArrayList<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) {
sentences.add(e.next());
}
return sentences;
}
public List<String> getKeywords() {

@ -0,0 +1,196 @@
/**
* SnippetExtractor
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 22.10.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
import java.util.Collection;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
/**
 * Extract a short text snippet from a collection of sentences such that the
 * snippet contains as many of the given query word hashes as possible.
 * On success the snippet and the set of query hashes that could not be matched
 * are available via {@link #getSnippet()} and {@link #getRemainingWords()};
 * if no snippet can be computed the constructors throw UnsupportedOperationException.
 */
public class SnippetExtractor {

    // the computed snippet text; assigned by the constructors
    String snippetString;
    // query hashes that could not be found in any selected sentence
    HandleSet remainingHashes;

    /**
     * Compute a snippet from the given sentences.
     * Sentences are scored by (in this order) the number of matching query words,
     * the distance between the matched word positions, a line-length preference
     * and the line number (the very first line is penalized); only the five
     * best-scored candidates are kept. The best candidate is trimmed to maxLength
     * and, while query words are still missing, more sentences are added recursively.
     * @param sentences the candidate sentences
     * @param queryhashes hashes of the query words the snippet shall contain
     * @param maxLength maximum length of the computed snippet
     * @throws UnsupportedOperationException if the input is invalid or no snippet could be computed
     */
    public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
        if (sentences == null) throw new UnsupportedOperationException("sentence == null");
        if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
        TreeMap<byte[], Integer> hs;
        final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
        // decreasing counter mixed into each key so equal scores still produce unique map keys
        long uniqCounter = 999L;
        Integer pos;
        TreeSet<Integer> positions;
        int linenumber = 0;
        for (final StringBuilder sentence: sentences) {
            hs = Condenser.hashSentence(sentence.toString());
            // collect the positions of all query words that occur in this sentence
            positions = new TreeSet<Integer>();
            for (final byte[] word: queryhashes) {
                pos = hs.get(word);
                if (pos != null) {
                    positions.add(pos);
                }
            }
            final int worddistance = positions.size() > 1 ? positions.last() - positions.first() : 0;
            // sort by
            // - 1st order: number of matching words
            // - 2nd order: word distance
            // - 3rd order: line length (not too short and not too long)
            // - 4th order: line number (earlier lines preferred; line 0 is penalized)
            if (positions.size() > 0) {
                order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
                // keep only the five best-scored sentences
                if (order.size() > 5) order.remove(order.firstEntry().getKey());
            }
            linenumber++;
        }

        StringBuilder sentence;
        SnippetExtractor tsr;
        while (!order.isEmpty()) {
            sentence = order.remove(order.lastKey()); // sentence with the biggest score
            try {
                tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
            } catch (final UnsupportedOperationException e) {
                // this sentence could not be trimmed to a snippet; try the next candidate
                continue;
            }
            snippetString = tsr.snippetString;
            if (snippetString != null && snippetString.length() > 0) {
                remainingHashes = tsr.remainingHashes;
                if (remainingHashes.isEmpty()) {
                    // we have found the snippet
                    return; // finished!
                } else if (remainingHashes.size() < queryhashes.size()) {
                    // the result has not all words in it.
                    // find another sentence that represents the missing other words
                    // and find recursively more sentences
                    maxLength = maxLength - snippetString.length();
                    if (maxLength < 20) maxLength = 20;
                    // (removed a no-op try/catch that only re-threw the same exception;
                    // an UnsupportedOperationException still propagates to the caller)
                    tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength);
                    final String nextSnippet = tsr.snippetString;
                    if (nextSnippet == null) return;
                    snippetString = snippetString + (" / " + nextSnippet);
                    remainingHashes = tsr.remainingHashes;
                    return;
                } else {
                    // error: the trimmed sentence did not reduce the set of missing words; try the next one
                    //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
                    continue;
                }
            }
        }
        throw new UnsupportedOperationException("no snippet computed");
    }

    /**
     * Map a sentence length to a preference key for the snippet scoring:
     * lines of at least half of maxlength score best, very short lines and
     * over-long lines score worst.
     * @param givenlength the length of the candidate sentence
     * @param maxlength the maximum snippet length
     * @return a score in {0, 1, 3, 5, 7}; larger is better
     */
    private static int linelengthKey(int givenlength, int maxlength) {
        if (givenlength > maxlength) return 1;
        if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
        if (givenlength >= maxlength / 4 && givenlength < maxlength / 2) return 5;
        if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
        return 0;
    }

    /**
     * Trim a single sentence to at most maxLength characters while keeping the
     * region where the query words occur; query hashes that do not occur in the
     * sentence are recorded in remainingHashes.
     * @param sentence the sentence to trim
     * @param queryhashes hashes of the query words
     * @param maxLength maximum length of the snippet
     * @throws UnsupportedOperationException on invalid input or if the string cutting failed
     */
    private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
        try {
            if (sentence == null) throw new UnsupportedOperationException("no sentence given");
            if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
            byte[] hash;

            // find all hashes that appear in the sentence
            final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
            final Iterator<byte[]> j = queryhashes.iterator();
            Integer pos;
            // minpos/maxpos bracket the first and last occurrence of any query word
            int p, minpos = sentence.length(), maxpos = -1;
            final HandleSet remainingHashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), 0);
            while (j.hasNext()) {
                hash = j.next();
                pos = hs.get(hash);
                if (pos == null) {
                    // query word not in this sentence: record it as still missing
                    try {
                        remainingHashes.put(hash);
                    } catch (final RowSpaceExceededException e) {
                        Log.logException(e);
                    }
                } else {
                    p = pos.intValue();
                    if (p > maxpos) maxpos = p;
                    if (p < minpos) minpos = p;
                }
            }
            // check result size
            maxpos = maxpos + 10;
            if (maxpos > sentence.length()) maxpos = sentence.length();
            if (minpos < 0) minpos = 0;
            // we have a result, but is it short enough?
            if (maxpos - minpos + 10 > maxLength) {
                // the string is too long, even if we cut at both ends
                // so cut here in the middle of the string
                final int lenb = sentence.length();
                sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
                    " [..] " +
                    sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
                maxpos = maxpos + lenb - sentence.length() + 6;
            }
            if (maxpos > maxLength) {
                // the string is too long, even if we cut it at the end
                // so cut it here at both ends at once
                assert maxpos >= minpos;
                final int newlen = Math.max(10, maxpos - minpos + 10);
                final int around = (maxLength - newlen) / 2;
                assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
                //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
                sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
                minpos = around;
                maxpos = sentence.length() - around - 5;
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 1st step (cut at right side)
                sentence = sentence.substring(0, maxpos).trim() + " [..]";
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 2nd step (cut at left side)
                sentence = "[..] " + sentence.substring(minpos).trim();
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 3rd step (cut in the middle)
                // NOTE(review): assumes the sentence is still at least 26 characters long here;
                // shorter input triggers the IndexOutOfBoundsException handler below
                sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
            }
            this.snippetString = sentence;
            this.remainingHashes = remainingHashes;
        } catch (final IndexOutOfBoundsException e) {
            // preserve the original exception as cause instead of discarding the stack trace
            throw new UnsupportedOperationException(e.getMessage(), e);
        }
    }

    public String getSnippet() {
        return this.snippetString;
    }

    public HandleSet getRemainingWords() {
        return this.remainingHashes;
    }
}
Loading…
Cancel
Save