From 10a9cb1971d03120bd8137029cdb536b131e6189 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 22 Sep 2010 20:50:02 +0000 Subject: [PATCH] simplified snippet computation process and separated the algorithm into two classes also enhances selection criteria for best snippet line computation git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7182 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 13 +- htroot/yacysearch.java | 42 +- source/de/anomic/http/server/HTTPDemon.java | 3 +- source/de/anomic/search/MediaSnippet.java | 60 +- source/de/anomic/search/ResultFetcher.java | 2 +- source/de/anomic/search/Switchboard.java | 11 +- source/de/anomic/search/TextSnippet.java | 537 +++++++----------- source/de/anomic/yacy/yacyClient.java | 2 +- .../storage/WeakPriorityBlockingQueue.java | 16 +- source/net/yacy/document/Document.java | 9 +- .../net/yacy/document/SnippetExtractor.java | 196 +++++++ 11 files changed, 487 insertions(+), 404 deletions(-) create mode 100644 source/net/yacy/document/SnippetExtractor.java diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index f6aba9517..61d028b3a 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URLDecoder; +import java.util.Collection; import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; @@ -243,7 +244,7 @@ public class ViewFile { } else if (viewMode.equals("sentences")) { prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES); - final Iterator sentences = document.getSentences(pre); + final Collection sentences = document.getSentences(pre); boolean dark = true; int i = 0; @@ -251,8 +252,8 @@ public class ViewFile { if (sentences != null) { // Search word highlighting - while (sentences.hasNext()) { - sentence = sentences.next().toString(); + for (StringBuilder s: sentences) { + sentence = s.toString(); if 
(sentence.trim().length() > 0) { prop.put("viewMode_sentences_" + i + "_nr", i + 1); prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence)); @@ -266,7 +267,7 @@ public class ViewFile { } else if (viewMode.equals("words")) { prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS); - final Iterator sentences = document.getSentences(pre); + final Collection sentences = document.getSentences(pre); boolean dark = true; int i = 0; @@ -274,8 +275,8 @@ public class ViewFile { if (sentences != null) { // Search word highlighting - while (sentences.hasNext()) { - sentence = sentences.next().toString(); + for (StringBuilder s: sentences) { + sentence = s.toString(); Enumeration tokens = Condenser.wordTokenizer(sentence, "UTF-8"); while (tokens.hasMoreElements()) { token = tokens.nextElement().toString(); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 53d75bf4b..3fde71546 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -80,6 +80,8 @@ public class yacysearch { final boolean searchAllowed = sb.getConfigBool("publicSearchpage", true) || sb.verifyAuthentication(header, false); final boolean authenticated = sb.adminAuthenticated(header) >= 2; + final boolean localhostAccess = sb.accessFromLocalhost(header); + int display = (post == null) ? 
0 : post.getInt("display", 0); if (!authenticated) display = 2; // display == 0: shop top menu @@ -234,34 +236,34 @@ public class yacysearch { global = false; snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; block = true; - Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search"); + Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search"); } else if (Domains.matchesList(client, sb.networkWhitelist)) { - Log.logInfo("LOCAL_SEARCH", "ACCECC CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions"); - } else if (!authenticated && (global || snippetFetchStrategy.isAllowedToFetchOnline())) { + Log.logInfo("LOCAL_SEARCH", "ACCESS CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions"); + } else if (!authenticated && !localhostAccess) { // in case that we do a global search or we want to fetch snippets, we check for DoS cases synchronized (trackerHandles) { int accInOneSecond = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 1000)).size(); int accInThreeSeconds = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size(); int accInOneMinute = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size(); int accInTenMinutes = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size(); - if (accInTenMinutes > 600) { - global = false; - snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; - block = true; - Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInTenMinutes + " searches in ten minutes, fully blocked (no results generated)"); - } else if (accInOneMinute > 200) { - global = false; - snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; + // protections against too strong YaCy network load, reduces remote search + if (global) { + if (accInTenMinutes >= 30 || 
accInOneMinute >= 6 || accInThreeSeconds >= 1) { + global = false; + Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + "/1s, " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed global search"); + } + } + // protection against too many remote server snippet loads (protects traffic on server) + if (snippetFetchStrategy.isAllowedToFetchOnline()) { + if (accInTenMinutes >= 20 || accInOneMinute >= 4 || accInThreeSeconds >= 1) { + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; + Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + "/1s, " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed remote snippet loading"); + } + } + // general load protection + if (accInTenMinutes >= 2000 || accInOneMinute >= 600 || accInOneSecond >= 20) { block = true; - Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneMinute + " searches in one minute, fully blocked (no results generated)"); - } else if (accInThreeSeconds > 1) { - global = false; - snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; - Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + " searches in three seconds, blocked global search and snippets"); - } else if (accInOneSecond > 2) { - global = false; - snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; - Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + " searches in one second, blocked global search and snippets"); + Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + "/1s, " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed search"); } } } diff --git 
a/source/de/anomic/http/server/HTTPDemon.java b/source/de/anomic/http/server/HTTPDemon.java index 177bc492f..d5a17cc49 100644 --- a/source/de/anomic/http/server/HTTPDemon.java +++ b/source/de/anomic/http/server/HTTPDemon.java @@ -256,10 +256,9 @@ public final class HTTPDemon implements serverHandler, Cloneable { public static int staticAdminAuthenticated(final String authorization, final serverSwitch sw) { // the authorization string must be given with the truncated 6 bytes at the beginning - if (authorization == null) return 1; - //if (authorization.length() < 6) return 1; // no authentication information given final String adminAccountBase64MD5 = sw.getConfig(ADMIN_ACCOUNT_B64MD5, ""); if (adminAccountBase64MD5.length() == 0) return 2; // no password stored + if (authorization == null || authorization.length() == 0) return 1; if (adminAccountBase64MD5.equals(Digest.encodeMD5Hex(authorization))) return 4; // hard-authenticated, all ok return 1; } diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index d785f8d1e..a4240320a 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -29,17 +29,20 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; import java.util.Map; +import java.util.TreeMap; import java.util.TreeSet; import de.anomic.crawler.CrawlProfile; import de.anomic.data.MimeTable; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.HandleSet; +import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.ByteArray; @@ -157,8 +160,8 @@ public class MediaSnippet implements Comparable, Comparator, 
Comparator 0 && ientry.width() < 64) continue; desc = ientry.alt(); int appcount = queryhashes.size() * 2 - - TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() - - TextSnippet.removeAppearanceHashes(desc, queryhashes).size(); + removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() - + removeAppearanceHashes(desc, queryhashes).size(); final long ranking = Long.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1); result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source)); } return result; } - - /* - private static String computeMediaSnippet(Map media, Set queryhashes) { - Iterator> i = media.entrySet().iterator(); - Map.Entry entry; - yacyURL url; - String desc; - Set s; - String result = ""; - while (i.hasNext()) { - entry = i.next(); - url = entry.getKey(); - desc = entry.getValue(); - s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); - if (isEmpty()) { - result += "
" + ((desc.length() == 0) ? url : desc) + ""; - continue; - } - s = removeAppearanceHashes(desc, s); - if (isEmpty()) { - result += "
" + ((desc.length() == 0) ? url : desc) + ""; - continue; + /** + * removed all word hashes that can be computed as tokens from a given sentence from a given hash set + * @param sentence + * @param queryhashes + * @return the given hash set minus the hashes from the tokenization of the given sentence + */ + private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) { + // remove all hashes that appear in the sentence + if (sentence == null) return queryhashes; + final TreeMap hs = Condenser.hashSentence(sentence); + final Iterator j = queryhashes.iterator(); + byte[] hash; + Integer pos; + final HandleSet remaininghashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), queryhashes.size()); + while (j.hasNext()) { + hash = j.next(); + pos = hs.get(hash); + if (pos == null) { + try { + remaininghashes.put(hash); + } catch (RowSpaceExceededException e) { + Log.logException(e); + } } } - if (result.length() == 0) return null; - return result.substring(6); + return remaininghashes; } - */ } diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 296a1b125..1024d53e4 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -220,7 +220,7 @@ public class ResultFetcher { if (query.contentdom == ContentDomain.TEXT) { // attach text snippet startTime = System.currentTimeMillis(); - final TextSnippet snippet = TextSnippet.retrieveTextSnippet( + final TextSnippet snippet = new TextSnippet( this.loader, metadata, snippetFetchWordHashes, diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 04367fc31..09ca9a92b 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -2057,12 +2057,19 @@ public final class Switchboard extends serverSwitch { } } - public int adminAuthenticated(final RequestHeader requestHeader) { + 
public boolean accessFromLocalhost(final RequestHeader requestHeader) { // authorization for localhost, only if flag is set to grant localhost access as admin final String clientIP = requestHeader.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, ""); + if (!Domains.isLocal(clientIP)) return false; final String refererHost = requestHeader.refererHost(); - boolean accessFromLocalhost = Domains.isLocal(clientIP) && (refererHost == null || refererHost.length() == 0 || Domains.isLocal(refererHost)); + return refererHost == null || refererHost.length() == 0 || Domains.isLocal(refererHost); + } + + public int adminAuthenticated(final RequestHeader requestHeader) { + + // authorization for localhost, only if flag is set to grant localhost access as admin + boolean accessFromLocalhost = accessFromLocalhost(requestHeader); if (getConfigBool("adminAccountForLocalhost", false) && accessFromLocalhost) return 3; // soft-authenticated for localhost // get the authorization string from the header diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index f8a84b437..0d18d458a 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -25,6 +25,7 @@ package de.anomic.search; import java.util.ArrayList; +import java.util.Collection; import java.util.Comparator; import java.util.Iterator; import java.util.TreeMap; @@ -36,20 +37,18 @@ import net.yacy.cora.storage.ConcurrentARC; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.SnippetExtractor; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.HandleSet; -import net.yacy.kelondro.index.RowSpaceExceededException; -import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import 
net.yacy.kelondro.util.ByteArray; import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; -import de.anomic.http.client.Cache; import de.anomic.yacy.yacySearch; public class TextSnippet implements Comparable, Comparator { @@ -67,66 +66,6 @@ public class TextSnippet implements Comparable, Comparator snippetsCache = new ConcurrentARC(maxCache, Math.max(10, Runtime.getRuntime().availableProcessors())); - private static final ARC faviconCache = new ConcurrentARC(maxCache, Math.max(10, Runtime.getRuntime().availableProcessors())); - - private final DigestURI url; - private String line; - private final String error; - private final int errorCode; - private HandleSet remaingHashes; - private final DigestURI favicon; - - public static boolean existsInCache(final DigestURI url, final HandleSet queryhashes) { - final String hashes = yacySearch.set2string(queryhashes); - return retrieveFromCache(hashes, new String(url.hash())) != null; - } - - public static void storeToCache(final String wordhashes, final String urlhash, final String snippet) { - // generate key - String key = urlhash + wordhashes; - - // do nothing if snippet is known - if (snippetsCache.containsKey(key)) return; - - // learn new snippet - snippetsCache.put(key, snippet); - } - - public static String retrieveFromCache(final String wordhashes, final String urlhash) { - // generate key - final String key = urlhash + wordhashes; - return snippetsCache.get(key); - } - - /** - * removed all word hashes that can be computed as tokens from a given sentence from a given hash set - * @param sentence - * @param queryhashes - * @return the given hash set minus the hashes from the tokenization of the given sentence - */ - public static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) { - // remove all hashes that appear in the sentence - if (sentence == null) return queryhashes; - final TreeMap hs = 
Condenser.hashSentence(sentence); - final Iterator j = queryhashes.iterator(); - byte[] hash; - Integer pos; - final HandleSet remaininghashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), queryhashes.size()); - while (j.hasNext()) { - hash = j.next(); - pos = hs.get(hash); - if (pos == null) { - try { - remaininghashes.put(hash); - } catch (RowSpaceExceededException e) { - Log.logException(e); - } - } - } - return remaininghashes; - } /** * \\A[^\\p{L}\\p{N}].+ @@ -148,54 +87,206 @@ public class TextSnippet implements Comparable, Comparator(.*?)(\\<b\\>.+?\\</b\\>)(.*) */ private final static Pattern p01 = Pattern.compile("(.*?)(\\.+?\\)(.*)"); // marked words are in -tags + + public static class Cache { + private final ARC cache; + public Cache() { + cache = new ConcurrentARC(maxCache, Math.max(10, Runtime.getRuntime().availableProcessors())); + } + public void put(final String wordhashes, final String urlhash, final String snippet) { + // generate key + String key = urlhash + wordhashes; + + // do nothing if snippet is known + if (cache.containsKey(key)) return; + + // learn new snippet + cache.put(key, snippet); + } + + public String get(final String wordhashes, final String urlhash) { + // generate key + final String key = urlhash + wordhashes; + return cache.get(key); + } + + public boolean contains(final String wordhashes, final String urlhash) { + return cache.containsKey(urlhash + wordhashes); + } + } - public TextSnippet(final DigestURI url, final String line, final int errorCode, final HandleSet remaingHashes, final String errortext) { - this(url, line, errorCode, remaingHashes, errortext, null); + public static final Cache snippetsCache = new Cache(); + + private byte[] urlhash; + private String line; + private String error; + private int errorCode; + + public TextSnippet(final byte[] urlhash, final String line, final int errorCode, final String errortext) { + init(urlhash, line, errorCode, errortext); + } + + public 
TextSnippet(final LoaderDispatcher loader, final URIMetadataRow.Components comp, final HandleSet queryhashes, final CrawlProfile.CacheStrategy cacheStrategy, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) { + // heise = "0OQUNU3JSs05" + final DigestURI url = comp.url(); + if (queryhashes.isEmpty()) { + //System.out.println("found no queryhashes for URL retrieve " + url); + init(url.hash(), null, ERROR_NO_HASH_GIVEN, "no query hashes given"); + return; + } + + // try to get snippet from snippetCache + int source = SOURCE_CACHE; + final String wordhashes = yacySearch.set2string(queryhashes); + final String urls = new String(url.hash()); + String line = snippetsCache.get(wordhashes, urls); + if (line != null) { + // found the snippet + init(url.hash(), line, source, null); + return; + } + + + /* =========================================================================== + * LOAD RESOURCE DATA + * =========================================================================== */ + // if the snippet is not in the cache, we can try to get it from the htcache + Response response; + try { + // first try to get the snippet from metadata + String loc; + boolean objectWasInCache = de.anomic.http.client.Cache.has(url); + boolean useMetadata = !objectWasInCache && !cacheStrategy.mustBeOffline(); + if (useMetadata && containsAllHashes(loc = comp.dc_title(), queryhashes)) { + // try to create the snippet from information given in the url itself + init(url.hash(), loc, SOURCE_METADATA, null); + return; + } else if (useMetadata && containsAllHashes(loc = comp.dc_creator(), queryhashes)) { + // try to create the snippet from information given in the creator metadata + init(url.hash(), loc, SOURCE_METADATA, null); + return; + } else if (useMetadata && containsAllHashes(loc = comp.dc_subject(), queryhashes)) { + // try to create the snippet from information given in the subject metadata + init(url.hash(), loc, SOURCE_METADATA, null); + 
return; + } else if (useMetadata && containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) { + // try to create the snippet from information given in the url + init(url.hash(), loc, SOURCE_METADATA, null); + return; + } else { + // try to load the resource from the cache + response = loader.load(loader.request(url, true, reindexing), cacheStrategy, Long.MAX_VALUE); + if (response == null) { + // in case that we did not get any result we can still return a success when we are not allowed to go online + if (cacheStrategy.mustBeOffline()) { + init(url.hash(), null, ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry"); + return; + } + + // if it is still not available, report an error + init(url.hash(), null, ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry"); + return; + } + if (!objectWasInCache) { + // place entry on indexing queue + Switchboard.getSwitchboard().toIndexer(response); + source = SOURCE_WEB; + } + } + } catch (final Exception e) { + //Log.logException(e); + init(url.hash(), null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage()); + return; + } + + /* =========================================================================== + * PARSE RESOURCE + * =========================================================================== */ + Document document = null; + try { + document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); + } catch (final Parser.Failure e) { + init(url.hash(), null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed + return; + } + if (document == null) { + init(url.hash(), null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed + return; + } + + /* =========================================================================== + * COMPUTE SNIPPET + * =========================================================================== */ + // we have found a parseable non-empty file: use the 
lines + + // compute snippet from text + final Collection sentences = document.getSentences(pre); + if (sentences == null) { + init(url.hash(), null, ERROR_PARSER_NO_LINES, "parser returned no sentences"); + return; + } + final SnippetExtractor tsr; + String textline = null; + HandleSet remainingHashes = queryhashes; + try { + tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength); + textline = tsr.getSnippet(); + remainingHashes = tsr.getRemainingWords(); + } catch (UnsupportedOperationException e) { + init(url.hash(), null, ERROR_NO_MATCH, "no matching snippet found"); + return; + } + + // compute snippet from media + //String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes); + //String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes); + //String appline = computeMediaSnippet(document.getApplinks(), queryhashes); + //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes); + //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes); + + line = ""; + //if (audioline != null) line += (line.length() == 0) ? audioline : "
" + audioline; + //if (videoline != null) line += (line.length() == 0) ? videoline : "
" + videoline; + //if (appline != null) line += (line.length() == 0) ? appline : "
" + appline; + //if (hrefline != null) line += (line.length() == 0) ? hrefline : "
" + hrefline; + if (textline != null) line += (line.length() == 0) ? textline : "
" + textline; + + if (line == null || !remainingHashes.isEmpty()) { + init(url.hash(), null, ERROR_NO_MATCH, "no matching snippet found"); + return; + } + if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength); + + // finally store this snippet in our own cache + snippetsCache.put(wordhashes, urls, line); + + document.close(); + init(url.hash(), line, source, null); } - public TextSnippet(final DigestURI url, final String line, final int errorCode, final HandleSet remaingHashes, final String errortext, final DigestURI favicon) { - this.url = url; + private void init(final byte[] urlhash, final String line, final int errorCode, final String errortext) { + this.urlhash = urlhash; this.line = line; this.errorCode = errorCode; this.error = errortext; - this.remaingHashes = remaingHashes; - this.favicon = favicon; - } - public DigestURI getUrl() { - return this.url; - } - public DigestURI getFavicon() { - return this.favicon; } + public boolean exists() { return line != null; } - public int compareTo(TextSnippet o) { - return Base64Order.enhancedCoder.compare(this.url.hash(), o.url.hash()); - } - public int compare(TextSnippet o1, TextSnippet o2) { - return o1.compareTo(o2); - } - public int hashCode() { - return ByteArray.hashCode(this.url.hash()); - } - @Override - public String toString() { - return (line == null) ? "" : line; - } public String getLineRaw() { return (line == null) ? "" : line; } + public String getError() { return (error == null) ? 
"" : error.trim(); } + public int getErrorCode() { return errorCode; } - public HandleSet getRemainingHashes() { - return this.remaingHashes; - } + public String getLineMarked(final HandleSet queryHashes) { if (line == null) return ""; if (queryHashes == null || queryHashes.isEmpty()) return line.trim(); @@ -225,6 +316,23 @@ public class TextSnippet implements Comparable, Comparator, Comparator sentences = document.getSentences(pre); - if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon); - final Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength); - final String textline = (tsr == null) ? null : (String) tsr[0]; - final HandleSet remainingHashes = (tsr == null) ? queryhashes : (HandleSet) tsr[1]; - - // compute snippet from media - //String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes); - //String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes); - //String appline = computeMediaSnippet(document.getApplinks(), queryhashes); - //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes); - //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes); - - line = ""; - //if (audioline != null) line += (line.length() == 0) ? audioline : "
" + audioline; - //if (videoline != null) line += (line.length() == 0) ? videoline : "
" + videoline; - //if (appline != null) line += (line.length() == 0) ? appline : "
" + appline; - //if (hrefline != null) line += (line.length() == 0) ? hrefline : "
" + hrefline; - if (textline != null) line += (line.length() == 0) ? textline : "
" + textline; - - if (line == null || !remainingHashes.isEmpty()) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found",resFavicon); - if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength); - - // finally store this snippet in our own cache - storeToCache(wordhashes, urls, line); - - document.close(); - return new TextSnippet(url, line, source, null, null, resFavicon); - } - private static boolean containsAllHashes(final String sentence, final HandleSet queryhashes) { final TreeMap m = Condenser.hashSentence(sentence); for (byte[] b: queryhashes) { @@ -427,137 +422,5 @@ public class TextSnippet implements Comparable, Comparator sentences, final HandleSet queryhashes, int maxLength) { - try { - if (sentences == null) return null; - if ((queryhashes == null) || (queryhashes.isEmpty())) return null; - Iterator j; - TreeMap hs; - StringBuilder sentence; - final TreeMap os = new TreeMap(); - int uniqCounter = 9999; - int score; - while (sentences.hasNext()) { - sentence = sentences.next(); - hs = Condenser.hashSentence(sentence.toString()); - j = queryhashes.iterator(); - score = 0; - while (j.hasNext()) {if (hs.containsKey(j.next())) score++;} - if (score > 0) { - os.put(Integer.valueOf(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence); - } - } - - String result; - HandleSet remaininghashes; - while (!os.isEmpty()) { - sentence = os.remove(os.lastKey()); // sentence with the biggest score - Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength); - if (tsr == null) continue; - result = (String) tsr[0]; - if ((result != null) && (result.length() > 0)) { - remaininghashes = (HandleSet) tsr[1]; - if (remaininghashes.isEmpty()) { - // we have found the snippet - return new Object[]{result, remaininghashes}; - } else if (remaininghashes.size() < queryhashes.size()) { - // the result has not all words in it. 
- // find another sentence that represents the missing other words - // and find recursively more sentences - maxLength = maxLength - result.length(); - if (maxLength < 20) maxLength = 20; - tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength); - if (tsr == null) return null; - final String nextSnippet = (String) tsr[0]; - if (nextSnippet == null) return tsr; - return new Object[]{result + (" / " + nextSnippet), tsr[1]}; - } else { - // error - //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'"; - continue; - } - } - } - return null; - } catch (final IndexOutOfBoundsException e) { - Log.logSevere("computeSnippet", "error with string generation", e); - return new Object[]{null, queryhashes}; - } - } - - private static Object[] /*{String - the snippet, HandleSet - remaining hashes}*/ - computeTextSnippet(String sentence, final HandleSet queryhashes, final int maxLength) { - try { - if (sentence == null) return null; - if ((queryhashes == null) || (queryhashes.isEmpty())) return null; - byte[] hash; - - // find all hashes that appear in the sentence - final TreeMap hs = Condenser.hashSentence(sentence); - final Iterator j = queryhashes.iterator(); - Integer pos; - int p, minpos = sentence.length(), maxpos = -1; - final HandleSet remainingHashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), 0); - while (j.hasNext()) { - hash = j.next(); - pos = hs.get(hash); - if (pos == null) { - try { - remainingHashes.put(hash); - } catch (RowSpaceExceededException e) { - Log.logException(e); - } - } else { - p = pos.intValue(); - if (p > maxpos) maxpos = p; - if (p < minpos) minpos = p; - } - } - // check result size - maxpos = maxpos + 10; - if (maxpos > sentence.length()) maxpos = sentence.length(); - if (minpos < 0) minpos = 0; - // we have a result, but is 
it short enough? - if (maxpos - minpos + 10 > maxLength) { - // the string is too long, even if we cut at both ends - // so cut here in the middle of the string - final int lenb = sentence.length(); - sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() + - " [..] " + - sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim(); - maxpos = maxpos + lenb - sentence.length() + 6; - } - if (maxpos > maxLength) { - // the string is too long, even if we cut it at the end - // so cut it here at both ends at once - assert maxpos >= minpos; - final int newlen = Math.max(10, maxpos - minpos + 10); - final int around = (maxLength - newlen) / 2; - assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); - //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); - sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]"; - minpos = around; - maxpos = sentence.length() - around - 5; - } - if (sentence.length() > maxLength) { - // trim sentence, 1st step (cut at right side) - sentence = sentence.substring(0, maxpos).trim() + " [..]"; - } - if (sentence.length() > maxLength) { - // trim sentence, 2nd step (cut at left side) - sentence = "[..] " + sentence.substring(minpos).trim(); - } - if (sentence.length() > maxLength) { - // trim sentence, 3rd step (cut in the middle) - sentence = sentence.substring(6, 20).trim() + " [..] 
" + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim(); - } - return new Object[] {sentence, remainingHashes}; - } catch (final IndexOutOfBoundsException e) { - Log.logSevere("computeSnippet", "error with string generation", e); - return null; - } - } - + } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 495310f01..0b117c651 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -480,7 +480,7 @@ public final class yacyClient { // because they are search-specific. // instead, they are placed in a snipped-search cache. // System.out.println("--- RECEIVED SNIPPET '" + urlEntry.snippet() + "'"); - TextSnippet.storeToCache(wordhashes, new String(urlEntry.hash()), urlEntry.snippet()); + TextSnippet.snippetsCache.put(wordhashes, new String(urlEntry.hash()), urlEntry.snippet()); } // add the url entry to the word indexes diff --git a/source/net/yacy/cora/storage/WeakPriorityBlockingQueue.java b/source/net/yacy/cora/storage/WeakPriorityBlockingQueue.java index e50db77d2..acc568454 100644 --- a/source/net/yacy/cora/storage/WeakPriorityBlockingQueue.java +++ b/source/net/yacy/cora/storage/WeakPriorityBlockingQueue.java @@ -226,7 +226,7 @@ public class WeakPriorityBlockingQueue { } /** - * return the specific amount of entrie as they would be retrievable with element() + * return the specific amount of entries as they would be retrievable with element() * if count is < 0 then all elements are taken * the returned list is not cloned from the internal list and shall not be modified in any way (read-only) * @param count @@ -234,15 +234,23 @@ public class WeakPriorityBlockingQueue { */ public synchronized ArrayList list(final int count) { if (count < 0) { - // shift all elements - while (!this.queue.isEmpty()) this.poll(); - return this.drained; + return list(); } if (count > sizeAvailable()) throw new RuntimeException("list(" + count + ") exceeded avaiable number of 
elements (" + sizeAvailable() + ")"); while (count > this.drained.size()) this.poll(); return this.drained; } + /** + * return all entries as they would be retrievable with element() + * @return a list of all elements in the stack + */ + public synchronized ArrayList list() { + // shift all elements + while (!this.queue.isEmpty()) this.poll(); + return this.drained; + } + /** * iterate over all elements available. All elements that are still in the queue are drained to recorded positions * @return an iterator over all drained positions. diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 5ee74b72b..6d1b2ca5d 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -33,6 +33,7 @@ import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Date; @@ -275,11 +276,15 @@ dc_rights return -1; } - public Iterator getSentences(final boolean pre) { + public List getSentences(final boolean pre) { if (this.text == null) return null; final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText()); e.pre(pre); - return e; + ArrayList sentences = new ArrayList(); + while (e.hasNext()) { + sentences.add(e.next()); + } + return sentences; } public List getKeywords() { diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java new file mode 100644 index 000000000..1210a6b2f --- /dev/null +++ b/source/net/yacy/document/SnippetExtractor.java @@ -0,0 +1,196 @@ +/** + * SnippetExtractor + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. 
M., Germany + * First released 22.10.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. + */ + +package net.yacy.document; + +import java.util.Collection; +import java.util.Iterator; +import java.util.TreeMap; +import java.util.TreeSet; + +import net.yacy.kelondro.index.HandleSet; +import net.yacy.kelondro.index.RowSpaceExceededException; +import net.yacy.kelondro.logging.Log; + +public class SnippetExtractor { + + String snippetString; + HandleSet remainingHashes; + + public SnippetExtractor(final Collection sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException { + if (sentences == null) throw new UnsupportedOperationException("sentence == null"); + if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null"); + TreeMap hs; + final TreeMap order = new TreeMap(); + long uniqCounter = 999L; + Integer pos; + TreeSet positions; + int linenumber = 0; + for (StringBuilder sentence: sentences) { + hs = Condenser.hashSentence(sentence.toString()); + positions = new TreeSet(); + for (byte[] word: queryhashes) { + pos = hs.get(word); + if (pos != null) { + positions.add(pos); + } + } + int worddistance = positions.size() > 1 ?
positions.last() - positions.first() : 0; + // sort by + // - 1st order: number of matching words + // - 2nd order: word distance + // - 3rd order: line length (not too short and not too long) + // - 4th order: line number + if (positions.size() > 0) { + order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence); + if (order.size() > 5) order.remove(order.firstEntry().getKey()); + } + linenumber++; + } + + StringBuilder sentence; + SnippetExtractor tsr; + while (!order.isEmpty()) { + sentence = order.remove(order.lastKey()); // sentence with the biggest score + try { + tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength); + } catch (UnsupportedOperationException e) { + continue; + } + snippetString = tsr.snippetString; + if (snippetString != null && snippetString.length() > 0) { + remainingHashes = tsr.remainingHashes; + if (remainingHashes.isEmpty()) { + // we have found the snippet + return; // finished! + } else if (remainingHashes.size() < queryhashes.size()) { + // the result has not all words in it.
+ // find another sentence that represents the missing other words + // and find recursively more sentences + maxLength = maxLength - snippetString.length(); + if (maxLength < 20) maxLength = 20; + try { + tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength); + } catch (UnsupportedOperationException e) { + throw e; + } + final String nextSnippet = tsr.snippetString; + if (nextSnippet == null) return; + snippetString = snippetString + (" / " + nextSnippet); + remainingHashes = tsr.remainingHashes; + return; + } else { + // error + //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'"; + continue; + } + } + } + throw new UnsupportedOperationException("no snippet computed"); + } + + private static int linelengthKey(int givenlength, int maxlength) { + if (givenlength > maxlength) return 1; + if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7; + if (givenlength >= maxlength / 4 && givenlength < maxlength / 2) return 5; + if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3; + return 0; + } + + private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException { + try { + if (sentence == null) throw new UnsupportedOperationException("no sentence given"); + if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null"); + byte[] hash; + + // find all hashes that appear in the sentence + final TreeMap hs = Condenser.hashSentence(sentence); + final Iterator j = queryhashes.iterator(); + Integer pos; + int p, minpos = sentence.length(), maxpos = -1; + final HandleSet remainingHashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), 0); + while (j.hasNext()) { + hash = j.next(); + pos = hs.get(hash); + if (pos == null) 
{ + try { + remainingHashes.put(hash); + } catch (RowSpaceExceededException e) { + Log.logException(e); + } + } else { + p = pos.intValue(); + if (p > maxpos) maxpos = p; + if (p < minpos) minpos = p; + } + } + // check result size + maxpos = maxpos + 10; + if (maxpos > sentence.length()) maxpos = sentence.length(); + if (minpos < 0) minpos = 0; + // we have a result, but is it short enough? + if (maxpos - minpos + 10 > maxLength) { + // the string is too long, even if we cut at both ends + // so cut here in the middle of the string + final int lenb = sentence.length(); + sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() + + " [..] " + + sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim(); + maxpos = maxpos + lenb - sentence.length() + 6; + } + if (maxpos > maxLength) { + // the string is too long, even if we cut it at the end + // so cut it here at both ends at once + assert maxpos >= minpos; + final int newlen = Math.max(10, maxpos - minpos + 10); + final int around = (maxLength - newlen) / 2; + assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); + //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); + sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]"; + minpos = around; + maxpos = sentence.length() - around - 5; + } + if (sentence.length() > maxLength) { + // trim sentence, 1st step (cut at right side) + sentence = sentence.substring(0, maxpos).trim() + " [..]"; + } + if (sentence.length() > maxLength) { + // trim sentence, 2nd step (cut at left side) + sentence = "[..] 
" + sentence.substring(minpos).trim(); + } + if (sentence.length() > maxLength) { + // trim sentence, 3rd step (cut in the middle) + sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim(); + } + this.snippetString = sentence; + this.remainingHashes = remainingHashes; + } catch (final IndexOutOfBoundsException e) { + throw new UnsupportedOperationException(e.getMessage()); + } + } + + public String getSnippet() { + return this.snippetString; + } + + public HandleSet getRemainingWords() { + return this.remainingHashes; + } +}