From 72e5407115ebfe8e836d9a6e79b6382deeeb13f9 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 27 Aug 2009 14:34:41 +0000 Subject: [PATCH] refactoring of snippet cache git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6268 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 4 +- htroot/ViewFile.java | 3 +- htroot/ViewImage.java | 3 +- htroot/yacysearch.java | 4 +- htroot/yacysearchitem.java | 11 +- .../crawler/retrieval/LoaderDispatcher.java | 122 +++ source/de/anomic/document/Condenser.java | 17 + source/de/anomic/document/Document.java | 59 ++ source/de/anomic/search/ImageCollector.java | 4 +- source/de/anomic/search/MediaSnippet.java | 167 ++++ source/de/anomic/search/ResultEntry.java | 12 +- source/de/anomic/search/SnippetCache.java | 933 ------------------ source/de/anomic/search/SnippetFetcher.java | 26 +- source/de/anomic/search/Switchboard.java | 6 +- source/de/anomic/search/TextSnippet.java | 597 +++++++++++ source/de/anomic/yacy/yacyClient.java | 4 +- 16 files changed, 1000 insertions(+), 972 deletions(-) create mode 100644 source/de/anomic/search/MediaSnippet.java delete mode 100644 source/de/anomic/search/SnippetCache.java create mode 100644 source/de/anomic/search/TextSnippet.java diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 8811cc179..89057b01a 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -36,6 +36,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Set; +import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; import de.anomic.data.userDB; @@ -44,7 +45,6 @@ import de.anomic.document.Document; import de.anomic.http.metadata.RequestHeader; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; -import de.anomic.search.SnippetCache; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -187,7 +187,7 @@ public class Bookmarks { Document document = null; if (urlentry != null) { final URLMetadataRow.Components metadata = urlentry.metadata(); - document = SnippetCache.retrieveDocument(metadata.url(), true, 5000, true, false); + document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false); prop.put("mode_edit", "0"); // create mode prop.put("mode_url", metadata.url().toNormalform(false, true)); prop.putHTML("mode_title", metadata.dc_title()); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 7cb38e957..a24f5a1c4 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -46,7 +46,6 @@ import de.anomic.http.metadata.RequestHeader; import de.anomic.http.metadata.ResponseHeader; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.FileUtils; -import de.anomic.search.SnippetCache; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -241,7 +240,7 @@ public class ViewFile { // parsing the resource content Document document = null; try { - document = SnippetCache.parseDocument(url, resourceLength, resource); + document = Document.parseDocument(url, resourceLength, resource); if (document == null) { prop.put("error", "5"); prop.put("error_errorText", "Unknown error"); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index c73f81b4b..67d812460 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -32,7 +32,6 @@ import java.util.HashMap; import 
de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.kelondro.util.FileUtils; -import de.anomic.search.SnippetCache; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -89,7 +88,7 @@ public class ViewImage { if (scaled == null) { Object[] resource = null; if (url != null) try { - resource = SnippetCache.getResource(url, true, timeout, false, true); + resource = sb.loader.getResource(url, true, timeout, false, true); } catch (IOException e) { Log.logWarning("ViewImage", "cannot load: " + e.getMessage()); } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 4aecb8378..8313d0eab 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -33,6 +33,7 @@ import java.util.Iterator; import java.util.TreeSet; import de.anomic.content.RSSMessage; +import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.document.Condenser; import de.anomic.document.Word; import de.anomic.document.Document; @@ -47,7 +48,6 @@ import de.anomic.search.QueryParams; import de.anomic.search.RankingProfile; import de.anomic.search.SearchEvent; import de.anomic.search.SearchEventCache; -import de.anomic.search.SnippetCache; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverCore; @@ -390,7 +390,7 @@ public class yacysearch { if (urlentry != null) { final URLMetadataRow.Components metadata = urlentry.metadata(); Document document; - document = SnippetCache.retrieveDocument(metadata.url(), true, 5000, true, false); + document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false); if (document != null) { // create a news message final HashMap map = new HashMap(); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index ea6b494f2..9c76f57ba 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -32,13 +32,14 @@ import java.util.TreeSet; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; +import de.anomic.search.MediaSnippet; import de.anomic.search.QueryParams; import de.anomic.search.SearchEvent; import de.anomic.search.RankingProcess; import de.anomic.search.ResultEntry; import de.anomic.search.SearchEventCache; -import de.anomic.search.SnippetCache; import de.anomic.search.Switchboard; +import de.anomic.search.TextSnippet; import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; @@ -144,7 +145,7 @@ public class yacysearchitem { prop.put("content_rankingprops", result.word().toPropertyForm() + ", domLengthEstimated=" + yacyURL.domLengthEstimation(result.hash()) + ((yacyURL.probablyRootURL(result.hash())) ? ", probablyRootURL" : "") + (((wordURL = yacyURL.probablyWordURL(result.hash(), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : "")); - final SnippetCache.TextSnippet snippet = result.textSnippet(); + final TextSnippet snippet = result.textSnippet(); final String desc = (snippet == null) ? 
"" : snippet.getLineMarked(theQuery.fullqueryHashes); prop.put("content_description", desc); prop.putXML("content_description-xml", desc); @@ -158,7 +159,7 @@ public class yacysearchitem { // image search; shows thumbnails prop.put("content", theQuery.contentdom + 1); // switch on specific content - final SnippetCache.MediaSnippet ms = theSearch.result().oneImage(item); + final MediaSnippet ms = theSearch.result().oneImage(item); if (ms == null) { prop.put("content_items", "0"); } else { @@ -184,10 +185,10 @@ public class yacysearchitem { if (result == null) return prop; // no content prop.put("content", theQuery.contentdom + 1); // switch on specific content - final ArrayList media = result.mediaSnippets(); + final ArrayList media = result.mediaSnippets(); if (item == 0) col = true; if (media != null) { - SnippetCache.MediaSnippet ms; + MediaSnippet ms; int c = 0; for (int i = 0; i < media.size(); i++) { ms = media.get(i); diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java index 4fddfabcd..40086de3a 100644 --- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java +++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java @@ -26,7 +26,9 @@ package de.anomic.crawler.retrieval; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.util.Arrays; import java.util.Date; import java.util.HashSet; @@ -35,6 +37,8 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import de.anomic.crawler.CrawlProfile; +import de.anomic.document.Document; +import de.anomic.document.ParserException; import de.anomic.http.client.Cache; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; @@ -234,7 +238,125 @@ public final class LoaderDispatcher { throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url()); } + + /** + * + * @param url + * @param fetchOnline + * @param socketTimeout + * @param forText + * @return an Object array containing + * + * + * + *
<tr><td>[0]</td><td>the content as {@link InputStream}</td></tr>
<tr><td>[1]</td><td>the content-length as {@link Long}</td></tr>
+     * </table>
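A minimal caller sketch for the getResource method declared just below; the LoaderDispatcher instance named loader and the yacyURL named url are assumed here, not taken from this patch. It shows how the returned Object[] pair is meant to be unpacked; note that the implementation boxes the length as a Long.

    // hypothetical caller of LoaderDispatcher.getResource (editor's sketch)
    try {
        final Object[] resource = loader.getResource(url, true, 5000, false, false);
        if (resource != null) {
            final InputStream content = (InputStream) resource[0];       // [0] the content
            final long contentLength = ((Long) resource[1]).longValue(); // [1] the content-length
            try {
                // ... consume up to contentLength bytes from content ...
            } finally {
                content.close();
            }
        }
    } catch (final IOException e) {
        Log.logWarning("snippet fetch", "cannot load: " + e.getMessage());
    }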
+     * @throws IOException
+     */
+    public Object[] getResource(final yacyURL url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
+        // load the url as resource from the web
+        long contentLength = -1;
+
+        // trying to load the resource body from cache
+        InputStream resource = Cache.getContentStream(url);
+        if (resource != null) {
+            contentLength = Cache.getResourceContentLength(url);
+        } else if (fetchOnline) {
+            // if the content is not available in cache try to download it from the web
+
+            // try to download the resource using the loader
+            final Response entry = load(url, forText, reindexing);
+            if (entry == null) return null; // not found in web
+
+            // read resource body (if it is there)
+            final byte[] resourceArray = entry.getContent();
+
+            // in case that the resource was not in ram, read it from disk
+            if (resourceArray == null) {
+                resource = Cache.getContentStream(url);
+                contentLength = Cache.getResourceContentLength(url);
+            } else {
+                resource = new ByteArrayInputStream(resourceArray);
+                contentLength = resourceArray.length;
+            }
+        } else {
+            return null;
+        }
+        return new Object[]{resource, Long.valueOf(contentLength)};
+    }
+    /**
+     * Tries to load and parse a resource specified by its URL.
+     * If the resource is not stored in cache and if fetchOnline is set,
+     * this function tries to download the resource from the web.
+     *
+     * @param url the URL of the resource
+     * @param fetchOnline specifies if the resource should be loaded from the web if it's not available in the cache
+     * @param timeout
+     * @param forText
+     * @param global the domain of the search. If global == true then the content is re-indexed
+     * @return the parsed document as {@link Document}
+     */
+    public static Document retrieveDocument(final yacyURL url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) {
+
+        // load resource
+        long resContentLength = 0;
+        InputStream resContent = null;
+        ResponseHeader responseHeader = null;
+        try {
+            // trying to load the resource from the cache
+            resContent = Cache.getContentStream(url);
+            responseHeader = Cache.getResponseHeader(url);
+            if (resContent != null) {
+                // if the content was found
+                resContentLength = Cache.getResourceContentLength(url);
+            } else if (fetchOnline) {
+                // if not found try to download it
+
+                // download resource using the crawler and keep resource in memory if possible
+                final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global);
+
+                // getting resource metadata (e.g.
the http headers for http resources) + if (entry != null) { + + // read resource body (if it is there) + final byte[] resourceArray = entry.getContent(); + if (resourceArray != null) { + resContent = new ByteArrayInputStream(resourceArray); + resContentLength = resourceArray.length; + } else { + resContent = Cache.getContentStream(url); + resContentLength = Cache.getResourceContentLength(url); + } + } + + // if it is still not available, report an error + if (resContent == null) { + Log.logFine("snippet fetch", "plasmaHTCache.Entry cache is NULL for url " + url); + return null; + } + } else { + Log.logFine("snippet fetch", "no resource available for url " + url); + return null; + } + } catch (final Exception e) { + Log.logFine("snippet fetch", "error loading resource: " + e.getMessage() + " for url " + url); + return null; + } + + // parse resource + Document document = null; + try { + document = Document.parseDocument(url, resContentLength, resContent, responseHeader); + } catch (final ParserException e) { + Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url); + return null; + } finally { + try { resContent.close(); } catch (final Exception e) {} + } + return document; + } + + public synchronized void cleanupAccessTimeTable(long timeout) { final Iterator> i = accessTime.entrySet().iterator(); Map.Entry e; diff --git a/source/de/anomic/document/Condenser.java b/source/de/anomic/document/Condenser.java index 45f60ec4c..b823960e3 100644 --- a/source/de/anomic/document/Condenser.java +++ b/source/de/anomic/document/Condenser.java @@ -48,6 +48,7 @@ import java.util.TreeSet; import de.anomic.document.language.Identificator; import de.anomic.document.parser.html.ContentScraper; import de.anomic.document.parser.html.ImageEntry; +import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.SetTools; @@ -463,6 +464,22 @@ public final class Condenser { } return true; } + + public static TreeMap hashSentence(final String sentence) { + // generates a word-wordPos mapping + final TreeMap map = new TreeMap(Base64Order.enhancedCoder); + final Enumeration words = wordTokenizer(sentence, "UTF-8"); + int pos = 0; + StringBuilder word; + byte[] hash; + while (words.hasMoreElements()) { + word = words.nextElement(); + hash = Word.word2hash(new String(word)); + if (!map.containsKey(hash)) map.put(hash, Integer.valueOf(pos)); // don't overwrite old values, that leads to too far word distances + pos += word.length() + 1; + } + return map; + } public static Enumeration wordTokenizer(final String s, final String charset) { try { diff --git a/source/de/anomic/document/Document.java b/source/de/anomic/document/Document.java index 0f17668a7..dd4bbad35 100644 --- a/source/de/anomic/document/Document.java +++ b/source/de/anomic/document/Document.java @@ -45,6 +45,9 @@ import java.util.TreeSet; import de.anomic.document.parser.html.ContentScraper; import de.anomic.document.parser.html.ImageEntry; +import de.anomic.http.client.Cache; +import de.anomic.http.client.Client; +import de.anomic.http.metadata.ResponseHeader; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.FileUtils; import de.anomic.server.serverCachedFileOutputStream; @@ -607,4 +610,60 @@ dc_rights super.finalize(); } + + /** + * Parse the resource + * @param url the URL of the resource + * @param contentLength the contentLength of the resource + * @param resourceStream the 
resource body as stream
+     * @param responseHeader metadata about the resource
+     * @return the extracted data
+     * @throws ParserException
+     */
+    public static Document parseDocument(final yacyURL url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException {
+        try {
+            if (resourceStream == null) return null;
+
+            // STEP 1: if no resource metadata is available, try to load it from cache
+            if (responseHeader == null) {
+                // try to get the header from the htcache directory
+                try {
+                    responseHeader = Cache.getResponseHeader(url);
+                } catch (final Exception e) {
+                    // ignore this. resource info loading failed
+                }
+            }
+
+            // STEP 2: if the metadata is still null try to download it from web
+            if ((responseHeader == null) && (url.getProtocol().startsWith("http"))) {
+                // TODO: we need a better solution here
+                // e.g. encapsulate this in the crawlLoader class
+
+                // getting URL mimeType
+                try {
+                    responseHeader = Client.whead(url.toString());
+                } catch (final Exception e) {
+                    // ignore this. http header download failed
+                }
+            }
+
+            // STEP 3: if the metadata is still null try to guess the mimeType of the resource
+            String supportError = Parser.supports(url, responseHeader == null ? null : responseHeader.mime());
+            if (supportError != null) {
+                return null;
+            }
+            if (responseHeader == null) {
+                return Parser.parseSource(url, null, null, contentLength, resourceStream);
+            }
+            return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
+        } catch (final InterruptedException e) {
+            // interruption of thread detected
+            return null;
+        }
+    }
+
+    public static Document parseDocument(final yacyURL url, final long contentLength, final InputStream resourceStream) throws ParserException {
+        return parseDocument(url, contentLength, resourceStream, null);
+    }
+
 }
diff --git a/source/de/anomic/search/ImageCollector.java b/source/de/anomic/search/ImageCollector.java
index f729b6c29..424caf794 100644
--- a/source/de/anomic/search/ImageCollector.java
+++ b/source/de/anomic/search/ImageCollector.java
@@ -46,7 +46,7 @@ public final class ImageCollector {
         if (maxTime > 10) {
             Object[] resource = null;
             try {
-                resource = SnippetCache.getResource(url, true, (int) maxTime, false, indexing);
+                resource = Switchboard.getSwitchboard().loader.getResource(url, true, (int) maxTime, false, indexing);
             } catch (IOException e) {
                 Log.logWarning("ViewImage", "cannot load: " + e.getMessage());
             }
@@ -57,7 +57,7 @@ public final class ImageCollector {
         Document document = null;
         try {
             // parse the document
-            document = SnippetCache.parseDocument(url, resLength.longValue(), res);
+            document = Document.parseDocument(url, resLength.longValue(), res);
         } catch (final ParserException e) { // parsing failed
             Log.logWarning("ViewImage", "cannot parse: " + e.getMessage());
         }
diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java
new file mode 100644
index 000000000..84aa55462
--- /dev/null
+++ b/source/de/anomic/search/MediaSnippet.java
@@ -0,0 +1,167 @@
+// MediaSnippet.java
+// -----------------
+// (C) by Michael Peter Christen; mc@yacy.net
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2005
+// last major change: 10.09.2009
+//
+// contributions by Marc Nause [MN]
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the
License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.search; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeSet; + +import de.anomic.crawler.retrieval.LoaderDispatcher; +import de.anomic.document.Document; +import de.anomic.document.parser.html.ImageEntry; +import de.anomic.yacy.yacyURL; +import de.anomic.yacy.logging.Log; + +public class MediaSnippet { + public int type; + public yacyURL href, source; + public String name, attr; + public int ranking; + public MediaSnippet(final int type, final yacyURL href, final String name, final String attr, final int ranking, final yacyURL source) { + this.type = type; + this.href = href; + this.source = source; // the web page where the media resource appeared + this.name = name; + this.attr = attr; + this.ranking = ranking; // the smaller the better! small values should be shown first + if ((this.name == null) || (this.name.length() == 0)) this.name = "_"; + if ((this.attr == null) || (this.attr.length() == 0)) this.attr = "_"; + } + public int hashCode() { + return href.hashCode(); + } + + public static ArrayList retrieveMediaSnippets(final yacyURL url, final TreeSet queryhashes, final int mediatype, final boolean fetchOnline, final int timeout, final boolean reindexing) { + if (queryhashes.size() == 0) { + Log.logFine("snippet fetch", "no query hashes given for url " + url); + return new ArrayList(); + } + + final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing); + final ArrayList a = new ArrayList(); + if (document != null) { + if ((mediatype == QueryParams.CONTENTDOM_ALL) || (mediatype == QueryParams.CONTENTDOM_AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, QueryParams.CONTENTDOM_AUDIO)); + if ((mediatype == QueryParams.CONTENTDOM_ALL) || (mediatype == QueryParams.CONTENTDOM_VIDEO)) a.addAll(computeMediaSnippets(document, queryhashes, QueryParams.CONTENTDOM_VIDEO)); + if ((mediatype == QueryParams.CONTENTDOM_ALL) || (mediatype == QueryParams.CONTENTDOM_APP)) a.addAll(computeMediaSnippets(document, queryhashes, QueryParams.CONTENTDOM_APP)); + if ((mediatype == QueryParams.CONTENTDOM_ALL) || (mediatype == QueryParams.CONTENTDOM_IMAGE)) a.addAll(computeImageSnippets(document, queryhashes)); + } + return a; + } + + public static ArrayList computeMediaSnippets(final Document document, final TreeSet queryhashes, final int mediatype) { + + if (document == null) return new ArrayList(); + Map media = null; + if (mediatype == QueryParams.CONTENTDOM_AUDIO) media = document.getAudiolinks(); + else if (mediatype == QueryParams.CONTENTDOM_VIDEO) media = document.getVideolinks(); + else if (mediatype == QueryParams.CONTENTDOM_APP) media = document.getApplinks(); + if (media == null) return null; + + final Iterator> i = media.entrySet().iterator(); + Map.Entry entry; + yacyURL url; + String desc; + TreeSet s; + final ArrayList result = new ArrayList(); + while (i.hasNext()) { + entry = i.next(); + url = entry.getKey(); + desc = entry.getValue(); + s = 
TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes); + if (s.size() == 0) { + result.add(new MediaSnippet(mediatype, url, desc, null, 0, document.dc_source())); + continue; + } + s = TextSnippet.removeAppearanceHashes(desc, s); + if (s.size() == 0) { + result.add(new MediaSnippet(mediatype, url, desc, null, 0, document.dc_source())); + continue; + } + } + return result; + } + + public static ArrayList computeImageSnippets(final Document document, final TreeSet queryhashes) { + + final TreeSet images = new TreeSet(); + images.addAll(document.getImages().values()); // iterates images in descending size order! + // a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode() + + final Iterator i = images.iterator(); + ImageEntry ientry; + yacyURL url; + String desc; + TreeSet s; + final ArrayList result = new ArrayList(); + while (i.hasNext()) { + ientry = i.next(); + url = ientry.url(); + desc = ientry.alt(); + s = TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes); + if (s.size() == 0) { + final int ranking = ientry.hashCode(); + result.add(new MediaSnippet(QueryParams.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height(), ranking, document.dc_source())); + continue; + } + s = TextSnippet.removeAppearanceHashes(desc, s); + if (s.size() == 0) { + final int ranking = ientry.hashCode(); + result.add(new MediaSnippet(QueryParams.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height(), ranking, document.dc_source())); + continue; + } + } + return result; + } + + + /* + private static String computeMediaSnippet(Map media, Set queryhashes) { + Iterator> i = media.entrySet().iterator(); + Map.Entry entry; + yacyURL url; + String desc; + Set s; + String result = ""; + while (i.hasNext()) { + entry = i.next(); + url = entry.getKey(); + desc = entry.getValue(); + s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); + if (s.size() == 0) { + result += "
" + ((desc.length() == 0) ? url : desc) + ""; + continue; + } + s = removeAppearanceHashes(desc, s); + if (s.size() == 0) { + result += "
" + ((desc.length() == 0) ? url : desc) + ""; + continue; + } + } + if (result.length() == 0) return null; + return result.substring(6); + } + */ + +} diff --git a/source/de/anomic/search/ResultEntry.java b/source/de/anomic/search/ResultEntry.java index ff30cfbd0..d0cc162cc 100644 --- a/source/de/anomic/search/ResultEntry.java +++ b/source/de/anomic/search/ResultEntry.java @@ -48,8 +48,8 @@ public class ResultEntry { private final URLMetadataRow.Components urlcomps; // buffer for components private String alternative_urlstring; private String alternative_urlname; - private final SnippetCache.TextSnippet textSnippet; - private final ArrayList mediaSnippets; + private final TextSnippet textSnippet; + private final ArrayList mediaSnippets; // statistic objects public long dbRetrievalTime, snippetComputationTime; @@ -57,8 +57,8 @@ public class ResultEntry { public ResultEntry(final URLMetadataRow urlentry, final Segment indexSegment, yacySeedDB peers, - final SnippetCache.TextSnippet textSnippet, - final ArrayList mediaSnippets, + final TextSnippet textSnippet, + final ArrayList mediaSnippets, final long dbRetrievalTime, final long snippetComputationTime) { this.urlentry = urlentry; this.urlcomps = urlentry.metadata(); @@ -118,10 +118,10 @@ public class ResultEntry { public String title() { return urlcomps.dc_title(); } - public SnippetCache.TextSnippet textSnippet() { + public TextSnippet textSnippet() { return this.textSnippet; } - public ArrayList mediaSnippets() { + public ArrayList mediaSnippets() { return this.mediaSnippets; } public Date modified() { diff --git a/source/de/anomic/search/SnippetCache.java b/source/de/anomic/search/SnippetCache.java deleted file mode 100644 index 00a936a8b..000000000 --- a/source/de/anomic/search/SnippetCache.java +++ /dev/null @@ -1,933 +0,0 @@ -// plasmaSnippetCache.java -// ----------------------- -// part of YaCy -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 -// last major change: 09.10.2006 -// -// contributions by Marc Nause [MN] -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. 
-// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.search; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import de.anomic.crawler.retrieval.Response; -import de.anomic.document.Condenser; -import de.anomic.document.Parser; -import de.anomic.document.ParserException; -import de.anomic.document.Word; -import de.anomic.document.Document; -import de.anomic.document.parser.html.CharacterCoding; -import de.anomic.document.parser.html.ImageEntry; -import de.anomic.http.client.Client; -import de.anomic.http.client.Cache; -import de.anomic.http.metadata.ResponseHeader; -import de.anomic.kelondro.index.SimpleARC; -import de.anomic.kelondro.order.Base64Order; -import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; -import de.anomic.kelondro.util.SetTools; -import de.anomic.yacy.yacySearch; -import de.anomic.yacy.yacyURL; -import de.anomic.yacy.logging.Log; - -public class SnippetCache { - - private static final int maxCache = 500; - - public static final int SOURCE_CACHE = 0; - public static final int SOURCE_FILE = 1; - public static final int SOURCE_WEB = 2; - public static final int SOURCE_METADATA = 3; - - public static final int ERROR_NO_HASH_GIVEN = 11; - public static final int ERROR_SOURCE_LOADING = 12; - public static final int ERROR_RESOURCE_LOADING = 13; - public static final int ERROR_PARSER_FAILED = 14; - public static final int ERROR_PARSER_NO_LINES = 15; - public static final int ERROR_NO_MATCH = 16; - - private static final SimpleARC snippetsCache = new SimpleARC(maxCache); - - /** - * a cache holding URLs to favicons specified by the page content, e.g. by using the html link-tag. e.g. - *
-     * 	 <link rel="shortcut icon" type="image/x-icon" href="../src/favicon.ico">
-     * 
- */ - private static final HashMap faviconCache = new HashMap(); - private static Log log = null; - private static Switchboard sb = null; - - public static void init( - final Log logx, - final Switchboard switchboard - ) { - log = logx; - sb = switchboard; - snippetsCache.clear(); - faviconCache.clear(); - } - - public static class TextSnippet { - private final yacyURL url; - private String line; - private final String error; - private final int errorCode; - TreeSet remaingHashes; - private final yacyURL favicon; - - /** - * \\A[^\\p{L}\\p{N}].+ - */ - private final static Pattern p1 = Pattern.compile("\\A[^\\p{L}\\p{N}].+"); - /** - * .+[^\\p{L}\\p{N}]\\Z - */ - private final static Pattern p2 = Pattern.compile(".+[^\\p{L}\\p{N}]\\Z"); - /** - * \\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z - */ - private final static Pattern p3 = Pattern.compile("\\A[\\p{L}\\p{N}]+[^\\p{L}\\p{N}].+\\Z"); - /** - * [^\\p{L}\\p{N}] - */ - private final static Pattern p4 = Pattern.compile("[^\\p{L}\\p{N}]"); - /** - * (.*?)(\\<b\\>.+?\\</b\\>)(.*) - */ - private final static Pattern p01 = Pattern.compile("(.*?)(\\.+?\\)(.*)"); // marked words are in -tags - - public TextSnippet(final yacyURL url, final String line, final int errorCode, final TreeSet remaingHashes, final String errortext) { - this(url,line,errorCode,remaingHashes,errortext,null); - } - - public TextSnippet(final yacyURL url, final String line, final int errorCode, final TreeSet remaingHashes, final String errortext, final yacyURL favicon) { - this.url = url; - this.line = line; - this.errorCode = errorCode; - this.error = errortext; - this.remaingHashes = remaingHashes; - this.favicon = favicon; - } - public yacyURL getUrl() { - return this.url; - } - public yacyURL getFavicon() { - return this.favicon; - } - public boolean exists() { - return line != null; - } - public String toString() { - return (line == null) ? "" : line; - } - public String getLineRaw() { - return (line == null) ? "" : line; - } - public String getError() { - return (error == null) ? 
"" : error.trim(); - } - public int getErrorCode() { - return errorCode; - } - public TreeSet getRemainingHashes() { - return this.remaingHashes; - } - public String getLineMarked(final TreeSet queryHashes) { - if (line == null) return ""; - if ((queryHashes == null) || (queryHashes.size() == 0)) return line.trim(); - if (line.endsWith(".")) line = line.substring(0, line.length() - 1); - final Iterator i = queryHashes.iterator(); - byte[] h; - final String[] w = line.split(" "); - while (i.hasNext()) { - h = i.next(); - for (int j = 0; j < w.length; j++) { - final ArrayList al = markedWordArrayList(w[j]); // mark special character separated words correctly if more than 1 word has to be marked - w[j] = ""; - for (int k = 0; k < al.size(); k++) { - if(k % 2 == 0){ // word has not been marked - w[j] += getWordMarked(al.get(k), h); - } else { // word has been marked, do not encode again - w[j] += al.get(k); - } - } - } - } - final StringBuilder l = new StringBuilder(line.length() + queryHashes.size() * 8); - for (int j = 0; j < w.length; j++) { - l.append(w[j]); - l.append(' '); - } - return l.toString().trim(); - } - - /** - * mark words with <b>-tags - * @param word the word to mark - * @param h the hash of the word to mark - * @return the marked word if hash matches, else the unmarked word - * @see #getLineMarked(Set) - */ - private static String getWordMarked(String word, byte[] h){ - //ignore punctuation marks (contrib [MN]) - //note to myself: - //For details on regex see "Mastering regular expressions" by J.E.F. Friedl - //especially p. 123 and p. 390/391 (in the German version of the 2nd edition) - - String prefix = ""; - String postfix = ""; - int len = 0; - - // cut off prefix if it contains of non-characters or non-numbers - while(p1.matcher(word).find()) { - prefix = prefix + word.substring(0,1); - word = word.substring(1); - } - - // cut off postfix if it contains of non-characters or non-numbers - while(p2.matcher(word).find()) { - len = word.length(); - postfix = word.substring(len-1,len) + postfix; - word = word.substring(0,len-1); - } - - //special treatment if there is a special character in the word - if(p3.matcher(word).find()) { - String out = ""; - String temp = ""; - for(int k=0; k < word.length(); k++) { - //is character a special character? 
- if(p4.matcher(word.substring(k,k+1)).find()) { - if (new String(Word.word2hash(temp)).equals(new String(h))) temp = "" + CharacterCoding.unicode2html(temp, false) + ""; - out = out + temp + CharacterCoding.unicode2html(word.substring(k,k+1), false); - temp = ""; - } - //last character - else if(k == (word.length()-1)) { - temp = temp + word.substring(k,k+1); - if (new String(Word.word2hash(temp)).equals(new String(h))) temp = "" + CharacterCoding.unicode2html(temp, false) + ""; - out = out + temp; - temp = ""; - } - else temp = temp + word.substring(k,k+1); - } - word = out; - } - - //end contrib [MN] - else if (new String(Word.word2hash(word)).equals(new String(h))) word = "" + CharacterCoding.unicode2html(word, false) + ""; - - word = CharacterCoding.unicode2html(prefix, false) - + word - + CharacterCoding.unicode2html(postfix, false); - return word; - } - - /** - * words that already has been marked has index (i % 2 == 1) - * words that has not yet been marked has index (i % 2 == 0) - * @param string the String to be processed - * @return words that already has and has not yet been marked - * @author [DW], 08.11.2008 - */ - private static ArrayList markedWordArrayList(String string){ - ArrayList al = new java.util.ArrayList(1); - Matcher m = p01.matcher(string); - while (m.find()) { - al.add(m.group(1)); - al.add(m.group(2)); - string = m.group(3); // the postfix - m = p01.matcher(string); - } - al.add(string); - return al; - } - - } - - public static class MediaSnippet { - public int type; - public yacyURL href, source; - public String name, attr; - public int ranking; - public MediaSnippet(final int type, final yacyURL href, final String name, final String attr, final int ranking, final yacyURL source) { - this.type = type; - this.href = href; - this.source = source; // the web page where the media resource appeared - this.name = name; - this.attr = attr; - this.ranking = ranking; // the smaller the better! 
small values should be shown first - if ((this.name == null) || (this.name.length() == 0)) this.name = "_"; - if ((this.attr == null) || (this.attr.length() == 0)) this.attr = "_"; - } - public int hashCode() { - return href.hashCode(); - } - } - - public static boolean existsInCache(final yacyURL url, final TreeSet queryhashes) { - final String hashes = yacySearch.set2string(queryhashes); - return retrieveFromCache(hashes, url.hash()) != null; - } - - @SuppressWarnings("unchecked") - public static TextSnippet retrieveTextSnippet(final URLMetadataRow.Components comp, final TreeSet queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) { - // heise = "0OQUNU3JSs05" - final yacyURL url = comp.url(); - if (queryhashes.size() == 0) { - //System.out.println("found no queryhashes for URL retrieve " + url); - return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given"); - } - - // try to get snippet from snippetCache - int source = SOURCE_CACHE; - final String wordhashes = yacySearch.set2string(queryhashes); - String line = retrieveFromCache(wordhashes, url.hash()); - if (line != null) { - // found the snippet - return new TextSnippet(url, line, source, null, null, faviconCache.get(url.hash())); - } - - /* =========================================================================== - * LOADING RESOURCE DATA - * =========================================================================== */ - // if the snippet is not in the cache, we can try to get it from the htcache - long resContentLength = 0; - InputStream resContent = null; - ResponseHeader responseHeader = null; - try { - // first try to get the snippet from metadata - String loc; - if (containsAllHashes(loc = comp.dc_title(), queryhashes)) { - // try to create the snippet from information given in the url itself - return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash())); - } else if (containsAllHashes(loc = comp.dc_creator(), queryhashes)) { - // try to create the snippet from information given in the creator metadata - return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash())); - } else if (containsAllHashes(loc = comp.dc_subject(), queryhashes)) { - // try to create the snippet from information given in the subject metadata - return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash())); - } else if (containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) { - // try to create the snippet from information given in the subject metadata - return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash())); - } else { - // trying to load the resource from the cache - resContent = Cache.getContentStream(url); - responseHeader = Cache.getResponseHeader(url); - if (resContent != null && ((resContentLength = Cache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) { - // content may be too large to be parsed here. To be fast, we omit calculation of snippet here - return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes"); - } else if (fetchOnline) { - // if not found try to download it - - // download resource using the crawler and keep resource in memory if possible - final Response entry = Switchboard.getSwitchboard().loader.load(url, true, reindexing); - - // getting resource metadata (e.g. 
the http headers for http resources) - if (entry != null) { - // place entry on indexing queue - sb.toIndexer(entry); - - // read resource body (if it is there) - final byte []resourceArray = entry.getContent(); - if (resourceArray != null) { - resContent = new ByteArrayInputStream(resourceArray); - resContentLength = resourceArray.length; - } else { - resContent = Cache.getContentStream(url); - resContentLength = Cache.getResourceContentLength(url); - } - } - - // if it is still not available, report an error - if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource, plasmaHTCache.Entry cache is NULL"); - - source = SOURCE_WEB; - } else { - return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "no resource available"); - } - } - } catch (final Exception e) { - //e.printStackTrace(); - return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "error loading resource: " + e.getMessage()); - } - - /* =========================================================================== - * PARSING RESOURCE - * =========================================================================== */ - Document document = null; - try { - document = parseDocument(url, resContentLength, resContent, responseHeader); - } catch (final ParserException e) { - return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed - } finally { - try { resContent.close(); } catch (final Exception e) {/* ignore this */} - } - if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed - - - /* =========================================================================== - * COMPUTE SNIPPET - * =========================================================================== */ - final yacyURL resFavicon = document.getFavicon(); - if (resFavicon != null) faviconCache.put(url.hash(), resFavicon); - // we have found a parseable non-empty file: use the lines - - // compute snippet from text - final Iterator sentences = document.getSentences(pre); - if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon); - final Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength); - final String textline = (tsr == null) ? null : (String) tsr[0]; - final TreeSet remainingHashes = (tsr == null) ? queryhashes : (TreeSet) tsr[1]; - - // compute snippet from media - //String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes); - //String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes); - //String appline = computeMediaSnippet(document.getApplinks(), queryhashes); - //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes); - //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes); - - line = ""; - //if (audioline != null) line += (line.length() == 0) ? audioline : "
" + audioline; - //if (videoline != null) line += (line.length() == 0) ? videoline : "
" + videoline; - //if (appline != null) line += (line.length() == 0) ? appline : "
" + appline; - //if (hrefline != null) line += (line.length() == 0) ? hrefline : "
" + hrefline; - if (textline != null) line += (line.length() == 0) ? textline : "
" + textline; - - if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found",resFavicon); - if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength); - - // finally store this snippet in our own cache - storeToCache(wordhashes, url.hash(), line); - - document.close(); - return new TextSnippet(url, line, source, null, null, resFavicon); - } - - /** - * Tries to load and parse a resource specified by it's URL. - * If the resource is not stored in cache and if fetchOnline is set the - * this function tries to download the resource from web. - * - * @param url the URL of the resource - * @param fetchOnline specifies if the resource should be loaded from web if it'as not available in the cache - * @param timeout - * @param forText - * @param global the domain of the search. If global == true then the content is re-indexed - * @return the parsed document as {@link Document} - */ - public static Document retrieveDocument(final yacyURL url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) { - - // load resource - long resContentLength = 0; - InputStream resContent = null; - ResponseHeader responseHeader = null; - try { - // trying to load the resource from the cache - resContent = Cache.getContentStream(url); - responseHeader = Cache.getResponseHeader(url); - if (resContent != null) { - // if the content was found - resContentLength = Cache.getResourceContentLength(url); - } else if (fetchOnline) { - // if not found try to download it - - // download resource using the crawler and keep resource in memory if possible - final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global); - - // getting resource metadata (e.g. 
the http headers for http resources) - if (entry != null) { - - // read resource body (if it is there) - final byte[] resourceArray = entry.getContent(); - if (resourceArray != null) { - resContent = new ByteArrayInputStream(resourceArray); - resContentLength = resourceArray.length; - } else { - resContent = Cache.getContentStream(url); - resContentLength = Cache.getResourceContentLength(url); - } - } - - // if it is still not available, report an error - if (resContent == null) { - Log.logFine("snippet fetch", "plasmaHTCache.Entry cache is NULL for url " + url); - return null; - } - } else { - Log.logFine("snippet fetch", "no resource available for url " + url); - return null; - } - } catch (final Exception e) { - Log.logFine("snippet fetch", "error loading resource: " + e.getMessage() + " for url " + url); - return null; - } - - // parse resource - Document document = null; - try { - document = parseDocument(url, resContentLength, resContent, responseHeader); - } catch (final ParserException e) { - Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url); - return null; - } finally { - try { resContent.close(); } catch (final Exception e) {} - } - return document; - } - - public static void storeToCache(final String wordhashes, final String urlhash, final String snippet) { - // generate key - String key = urlhash + wordhashes; - - // do nothing if snippet is known - if (snippetsCache.containsKey(key)) return; - - // learn new snippet - snippetsCache.put(key, snippet); - } - - private static String retrieveFromCache(final String wordhashes, final String urlhash) { - // generate key - final String key = urlhash + wordhashes; - return snippetsCache.get(key); - } - - /* - private static String computeMediaSnippet(Map media, Set queryhashes) { - Iterator> i = media.entrySet().iterator(); - Map.Entry entry; - yacyURL url; - String desc; - Set s; - String result = ""; - while (i.hasNext()) { - entry = i.next(); - url = entry.getKey(); - desc = entry.getValue(); - s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); - if (s.size() == 0) { - result += "
" + ((desc.length() == 0) ? url : desc) + ""; - continue; - } - s = removeAppearanceHashes(desc, s); - if (s.size() == 0) { - result += "
" + ((desc.length() == 0) ? url : desc) + ""; - continue; - } - } - if (result.length() == 0) return null; - return result.substring(6); - } - */ - - @SuppressWarnings("unchecked") - private static Object[] /*{String - the snippet, Set - remaining hashes}*/ - computeTextSnippet(final Iterator sentences, final TreeSet queryhashes, int maxLength) { - try { - if (sentences == null) return null; - if ((queryhashes == null) || (queryhashes.size() == 0)) return null; - Iterator j; - TreeMap hs; - StringBuilder sentence; - final TreeMap os = new TreeMap(); - int uniqCounter = 9999; - int score; - while (sentences.hasNext()) { - sentence = sentences.next(); - hs = hashSentence(sentence.toString()); - j = queryhashes.iterator(); - score = 0; - while (j.hasNext()) {if (hs.containsKey(j.next())) score++;} - if (score > 0) { - os.put(Integer.valueOf(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence); - } - } - - String result; - TreeSet remaininghashes; - while (os.size() > 0) { - sentence = os.remove(os.lastKey()); // sentence with the biggest score - Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength); - if (tsr == null) continue; - result = (String) tsr[0]; - if ((result != null) && (result.length() > 0)) { - remaininghashes = (TreeSet) tsr[1]; - if (remaininghashes.size() == 0) { - // we have found the snippet - return new Object[]{result, remaininghashes}; - } else if (remaininghashes.size() < queryhashes.size()) { - // the result has not all words in it. - // find another sentence that represents the missing other words - // and find recursively more sentences - maxLength = maxLength - result.length(); - if (maxLength < 20) maxLength = 20; - tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength); - if (tsr == null) return null; - final String nextSnippet = (String) tsr[0]; - if (nextSnippet == null) return tsr; - return new Object[]{result + (" / " + nextSnippet), tsr[1]}; - } else { - // error - //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'"; - continue; - } - } - } - return null; - } catch (final IndexOutOfBoundsException e) { - log.logSevere("computeSnippet: error with string generation", e); - return new Object[]{null, queryhashes}; - } - } - - private static Object[] /*{String - the snippet, Set - remaining hashes}*/ - computeTextSnippet(String sentence, final TreeSet queryhashes, final int maxLength) { - try { - if (sentence == null) return null; - if ((queryhashes == null) || (queryhashes.size() == 0)) return null; - byte[] hash; - - // find all hashes that appear in the sentence - final TreeMap hs = hashSentence(sentence); - final Iterator j = queryhashes.iterator(); - Integer pos; - int p, minpos = sentence.length(), maxpos = -1; - final TreeSet remainingHashes = new TreeSet(Base64Order.enhancedCoder); - while (j.hasNext()) { - hash = j.next(); - pos = hs.get(hash); - if (pos == null) { - remainingHashes.add(hash); - } else { - p = pos.intValue(); - if (p > maxpos) maxpos = p; - if (p < minpos) minpos = p; - } - } - // check result size - maxpos = maxpos + 10; - if (maxpos > sentence.length()) maxpos = sentence.length(); - if (minpos < 0) minpos = 0; - // we have a result, but is it short enough? 
- if (maxpos - minpos + 10 > maxLength) { - // the string is too long, even if we cut at both ends - // so cut here in the middle of the string - final int lenb = sentence.length(); - sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() + - " [..] " + - sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim(); - maxpos = maxpos + lenb - sentence.length() + 6; - } - if (maxpos > maxLength) { - // the string is too long, even if we cut it at the end - // so cut it here at both ends at once - assert maxpos >= minpos; - final int newlen = Math.max(10, maxpos - minpos + 10); - final int around = (maxLength - newlen) / 2; - assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); - //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); - sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]"; - minpos = around; - maxpos = sentence.length() - around - 5; - } - if (sentence.length() > maxLength) { - // trim sentence, 1st step (cut at right side) - sentence = sentence.substring(0, maxpos).trim() + " [..]"; - } - if (sentence.length() > maxLength) { - // trim sentence, 2nd step (cut at left side) - sentence = "[..] " + sentence.substring(minpos).trim(); - } - if (sentence.length() > maxLength) { - // trim sentence, 3rd step (cut in the middle) - sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim(); - } - return new Object[] {sentence, remainingHashes}; - } catch (final IndexOutOfBoundsException e) { - log.logSevere("computeSnippet: error with string generation", e); - return null; - } - } - - public static ArrayList retrieveMediaSnippets(final yacyURL url, final TreeSet queryhashes, final int mediatype, final boolean fetchOnline, final int timeout, final boolean reindexing) { - if (queryhashes.size() == 0) { - Log.logFine("snippet fetch", "no query hashes given for url " + url); - return new ArrayList(); - } - - final Document document = retrieveDocument(url, fetchOnline, timeout, false, reindexing); - final ArrayList a = new ArrayList(); - if (document != null) { - if ((mediatype == QueryParams.CONTENTDOM_ALL) || (mediatype == QueryParams.CONTENTDOM_AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, QueryParams.CONTENTDOM_AUDIO)); - if ((mediatype == QueryParams.CONTENTDOM_ALL) || (mediatype == QueryParams.CONTENTDOM_VIDEO)) a.addAll(computeMediaSnippets(document, queryhashes, QueryParams.CONTENTDOM_VIDEO)); - if ((mediatype == QueryParams.CONTENTDOM_ALL) || (mediatype == QueryParams.CONTENTDOM_APP)) a.addAll(computeMediaSnippets(document, queryhashes, QueryParams.CONTENTDOM_APP)); - if ((mediatype == QueryParams.CONTENTDOM_ALL) || (mediatype == QueryParams.CONTENTDOM_IMAGE)) a.addAll(computeImageSnippets(document, queryhashes)); - } - return a; - } - - public static ArrayList computeMediaSnippets(final Document document, final TreeSet queryhashes, final int mediatype) { - - if (document == null) return new ArrayList(); - Map media = null; - if (mediatype == QueryParams.CONTENTDOM_AUDIO) media = document.getAudiolinks(); - else if (mediatype == 
QueryParams.CONTENTDOM_VIDEO) media = document.getVideolinks(); - else if (mediatype == QueryParams.CONTENTDOM_APP) media = document.getApplinks(); - if (media == null) return null; - - final Iterator> i = media.entrySet().iterator(); - Map.Entry entry; - yacyURL url; - String desc; - TreeSet s; - final ArrayList result = new ArrayList(); - while (i.hasNext()) { - entry = i.next(); - url = entry.getKey(); - desc = entry.getValue(); - s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); - if (s.size() == 0) { - result.add(new MediaSnippet(mediatype, url, desc, null, 0, document.dc_source())); - continue; - } - s = removeAppearanceHashes(desc, s); - if (s.size() == 0) { - result.add(new MediaSnippet(mediatype, url, desc, null, 0, document.dc_source())); - continue; - } - } - return result; - } - - public static ArrayList computeImageSnippets(final Document document, final TreeSet queryhashes) { - - final TreeSet images = new TreeSet(); - images.addAll(document.getImages().values()); // iterates images in descending size order! - // a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode() - - final Iterator i = images.iterator(); - ImageEntry ientry; - yacyURL url; - String desc; - TreeSet s; - final ArrayList result = new ArrayList(); - while (i.hasNext()) { - ientry = i.next(); - url = ientry.url(); - desc = ientry.alt(); - s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); - if (s.size() == 0) { - final int ranking = ientry.hashCode(); - result.add(new MediaSnippet(QueryParams.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height(), ranking, document.dc_source())); - continue; - } - s = removeAppearanceHashes(desc, s); - if (s.size() == 0) { - final int ranking = ientry.hashCode(); - result.add(new MediaSnippet(QueryParams.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height(), ranking, document.dc_source())); - continue; - } - } - return result; - } - - private static TreeSet removeAppearanceHashes(final String sentence, final TreeSet queryhashes) { - // remove all hashes that appear in the sentence - if (sentence == null) return queryhashes; - final TreeMap hs = hashSentence(sentence); - final Iterator j = queryhashes.iterator(); - byte[] hash; - Integer pos; - final TreeSet remaininghashes = new TreeSet(Base64Order.enhancedCoder); - while (j.hasNext()) { - hash = j.next(); - pos = hs.get(hash); - if (pos == null) { - remaininghashes.add(hash); - } - } - return remaininghashes; - } - - private static TreeMap hashSentence(final String sentence) { - // generates a word-wordPos mapping - final TreeMap map = new TreeMap(Base64Order.enhancedCoder); - final Enumeration words = Condenser.wordTokenizer(sentence, "UTF-8"); - int pos = 0; - StringBuilder word; - byte[] hash; - while (words.hasMoreElements()) { - word = words.nextElement(); - hash = Word.word2hash(new String(word)); - if (!map.containsKey(hash)) map.put(hash, Integer.valueOf(pos)); // don't overwrite old values, that leads to too far word distances - pos += word.length() + 1; - } - return map; - } - - private static boolean containsAllHashes(final String sentence, final Set queryhashes) { - final TreeMap m = hashSentence(sentence); - final Iterator i = queryhashes.iterator(); - while (i.hasNext()) { - if (!(m.containsKey(i.next()))) return false; - } - return true; - } - - public static Document parseDocument(final yacyURL url, final long contentLength, final InputStream resourceStream) throws ParserException { - return 
-    public static Document parseDocument(final yacyURL url, final long contentLength, final InputStream resourceStream) throws ParserException {
-        return parseDocument(url, contentLength, resourceStream, null);
-    }
-
-    /**
-     * Parse the resource
-     * @param url the URL of the resource
-     * @param contentLength the contentLength of the resource
-     * @param resourceStream the resource body as stream
-     * @param responseHeader metadata about the resource
-     * @return the extracted data
-     * @throws ParserException
-     */
-    public static Document parseDocument(final yacyURL url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException {
-        try {
-            if (resourceStream == null) return null;
-
-            // STEP 1: if no resource metadata is available, try to load it from cache
-            if (responseHeader == null) {
-                // try to get the header from the htcache directory
-                try {
-                    responseHeader = Cache.getResponseHeader(url);
-                } catch (final Exception e) {
-                    // ignore this. resource info loading failed
-                }
-            }
-
-            // STEP 2: if the metadata is still null try to download it from the web
-            if ((responseHeader == null) && (url.getProtocol().startsWith("http"))) {
-                // TODO: we need a better solution here
-                // e.g. encapsulate this in the crawlLoader class
-
-                // getting URL mimeType
-                try {
-                    responseHeader = Client.whead(url.toString());
-                } catch (final Exception e) {
-                    // ignore this. http header download failed
-                }
-            }
-
-            // STEP 3: if the metadata is still null try to guess the mimeType of the resource
-            String supportError = Parser.supports(url, responseHeader == null ? null : responseHeader.mime());
-            if (supportError != null) {
-                log.logInfo("could not generate snippet for " + url.toNormalform(true, false) + ": " + supportError);
-                return null;
-            }
-            if (responseHeader == null) {
-                return Parser.parseSource(url, null, null, contentLength, resourceStream);
-            }
-            return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
-        } catch (final InterruptedException e) {
-            // interruption of thread detected
-            return null;
-        }
-    }
-
-    /**
-     *
-     * @param url
-     * @param fetchOnline
-     * @param socketTimeout
-     * @param forText
-     * @return an Object array containing
-     *         [0] the content as {@link InputStream}
-     *         [1] the content-length as {@link Long}
- * @throws IOException - */ - public static Object[] getResource(final yacyURL url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { - // load the url as resource from the web - long contentLength = -1; - - // trying to load the resource body from cache - InputStream resource = Cache.getContentStream(url); - if (resource != null) { - contentLength = Cache.getResourceContentLength(url); - } else if (fetchOnline) { - // if the content is not available in cache try to download it from web - - // try to download the resource using a crawler - final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, reindexing); - if (entry == null) return null; // not found in web - - // read resource body (if it is there) - final byte[] resourceArray = entry.getContent(); - - // in case that the resource was not in ram, read it from disk - if (resourceArray == null) { - resource = Cache.getContentStream(url); - contentLength = Cache.getResourceContentLength(url); - } else { - resource = new ByteArrayInputStream(resourceArray); - contentLength = resourceArray.length; - } - } else { - return null; - } - return new Object[]{resource, Long.valueOf(contentLength)}; - } - - public static String failConsequences(final TextSnippet snippet, final String eventID) throws IOException { - // problems with snippet fetch - final String urlHash = snippet.getUrl().hash(); - final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' '); - if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) || - (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) || - (snippet.getErrorCode() == ERROR_PARSER_FAILED) || - (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) { - log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError()); - Switchboard.getSwitchboard().indexSegment.urlMetadata().remove(urlHash); - final SearchEvent event = SearchEventCache.getEvent(eventID); - assert Switchboard.getSwitchboard() != null; - assert Switchboard.getSwitchboard().indexSegment != null; - assert event != null : "eventID = " + eventID; - assert event.getQuery() != null; - Switchboard.getSwitchboard().indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash); - event.remove(urlHash); - } - if (snippet.getErrorCode() == ERROR_NO_MATCH) { - log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError()); - Switchboard.getSwitchboard().indexSegment.termIndex().remove(snippet.remaingHashes, urlHash); - SearchEventCache.getEvent(eventID).remove(urlHash); - } - return snippet.getError(); - } - -} \ No newline at end of file diff --git a/source/de/anomic/search/SnippetFetcher.java b/source/de/anomic/search/SnippetFetcher.java index 6e321f270..ed1f8431f 100644 --- a/source/de/anomic/search/SnippetFetcher.java +++ b/source/de/anomic/search/SnippetFetcher.java @@ -39,7 +39,7 @@ import de.anomic.kelondro.util.SetTools; import de.anomic.kelondro.util.SortStack; import de.anomic.kelondro.util.SortStore; import de.anomic.search.RankingProcess.NavigatorEntry; -import de.anomic.search.SnippetCache.MediaSnippet; +import de.anomic.search.MediaSnippet; import de.anomic.server.serverProfiling; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.logging.Log; @@ -56,11 +56,11 @@ public class SnippetFetcher { private final yacySeedDB peers; // result 
values - protected Worker[] workerThreads; - protected final SortStore result; - protected final SortStore images; // container to sort images by size - protected final HashMap failedURLs; // a mapping from a urlhash to a fail reason string - protected final TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets + protected Worker[] workerThreads; + protected final SortStore result; + protected final SortStore images; // container to sort images by size + protected final HashMap failedURLs; // a mapping from a urlhash to a fail reason string + protected final TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets long urlRetrievalAllTime; long snippetComputationAllTime; @@ -80,7 +80,7 @@ public class SnippetFetcher { this.urlRetrievalAllTime = 0; this.snippetComputationAllTime = 0; this.result = new SortStore(-1); // this is the result, enriched with snippets, ranked and ordered by ranking - this.images = new SortStore(-1); + this.images = new SortStore(-1); this.failedURLs = new HashMap(); // a map of urls to reason strings where a worker thread tried to work on, but failed. // snippets do not need to match with the complete query hashes, @@ -185,7 +185,7 @@ public class SnippetFetcher { if (query.contentdom == QueryParams.CONTENTDOM_TEXT) { // attach text snippet startTime = System.currentTimeMillis(); - final SnippetCache.TextSnippet snippet = SnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal()); + final TextSnippet snippet = TextSnippet.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? 
"snippet found" : ("no snippet found (" + snippet.getError() + ")"))); @@ -201,7 +201,7 @@ public class SnippetFetcher { registerFailure(page.hash(), "no text snippet for URL " + metadata.url()); if (!peers.mySeed().isVirgin()) try { - SnippetCache.failConsequences(snippet, query.id(false)); + TextSnippet.failConsequences(snippet, query.id(false)); } catch (IOException e) { e.printStackTrace(); } @@ -210,7 +210,7 @@ public class SnippetFetcher { } else { // attach media information startTime = System.currentTimeMillis(); - final ArrayList mediaSnippets = SnippetCache.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal()); + final ArrayList mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime); @@ -355,7 +355,7 @@ public class SnippetFetcher { return re; } - public SnippetCache.MediaSnippet oneImage(final int item) { + public MediaSnippet oneImage(final int item) { // check if we already retrieved this item (happens if a search pages is accessed a second time) if (this.images.sizeStore() > item) { // we have the wanted result already in the result array .. return that @@ -367,10 +367,10 @@ public class SnippetFetcher { for (int i = 0; i < count; i++) { // generate result object final ResultEntry result = nextResult(); - SnippetCache.MediaSnippet ms; + MediaSnippet ms; if (result != null) { // iterate over all images in the result - final ArrayList imagemedia = result.mediaSnippets(); + final ArrayList imagemedia = result.mediaSnippets(); if (imagemedia != null) { for (int j = 0; j < imagemedia.size(); j++) { ms = imagemedia.get(j); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 1856b69e4..ea9ba4ce3 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -577,7 +577,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // generate snippets cache log.logConfig("Initializing Snippet Cache"); - SnippetCache.init(log, this); + TextSnippet.init(log, this); // init the wiki wikiParser = new wikiCode(this.peers.mySeed().getClusterAddress()); @@ -1805,7 +1805,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // get the resource content Object[] resource = null; try { - resource = SnippetCache.getResource(metadata.url(), fetchOnline, 10000, true, false); + resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); } catch (IOException e) { Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); } @@ -1818,7 +1818,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi final Long resourceContentLength = (Long) resource[1]; // parse the resource - final Document document = SnippetCache.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent); + final Document document = Document.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent); // get the word set Set words = null; diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java new file mode 100644 index 000000000..fb23d1155 --- /dev/null +++ 
+++ b/source/de/anomic/search/TextSnippet.java
@@ -0,0 +1,597 @@
+// TextSnippet.java
+// -----------------
+// (C) by Michael Peter Christen; mc@yacy.net
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2005
+// last major change: 10.09.2009
+//
+// contributions by Marc Nause [MN]
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.search;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.anomic.crawler.retrieval.Response;
+import de.anomic.document.Condenser;
+import de.anomic.document.Document;
+import de.anomic.document.ParserException;
+import de.anomic.document.Word;
+import de.anomic.document.parser.html.CharacterCoding;
+import de.anomic.http.client.Cache;
+import de.anomic.http.metadata.ResponseHeader;
+import de.anomic.kelondro.index.SimpleARC;
+import de.anomic.kelondro.order.Base64Order;
+import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
+import de.anomic.kelondro.util.SetTools;
+import de.anomic.yacy.yacySearch;
+import de.anomic.yacy.yacyURL;
+import de.anomic.yacy.logging.Log;
+
+public class TextSnippet {
+
+    private static final int maxCache = 1000;
+
+    public static final int SOURCE_CACHE = 0;
+    public static final int SOURCE_FILE = 1;
+    public static final int SOURCE_WEB = 2;
+    public static final int SOURCE_METADATA = 3;
+
+    public static final int ERROR_NO_HASH_GIVEN = 11;
+    public static final int ERROR_SOURCE_LOADING = 12;
+    public static final int ERROR_RESOURCE_LOADING = 13;
+    public static final int ERROR_PARSER_FAILED = 14;
+    public static final int ERROR_PARSER_NO_LINES = 15;
+    public static final int ERROR_NO_MATCH = 16;
+
+    private static final SimpleARC<String, String> snippetsCache = new SimpleARC<String, String>(maxCache);
+    private static final SimpleARC<String, yacyURL> faviconCache = new SimpleARC<String, yacyURL>(maxCache);
+
+    private final yacyURL url;
+    private String line;
+    private final String error;
+    private final int errorCode;
+    private TreeSet<byte[]> remaingHashes;
+    private final yacyURL favicon;
+
+    private static Log log = null;
+    private static Switchboard sb = null;
+
+    public static void init(
+            final Log logx,
+            final Switchboard switchboard
+    ) {
+        log = logx;
+        sb = switchboard;
+    }
+
+    public static boolean existsInCache(final yacyURL url, final TreeSet<byte[]> queryhashes) {
+        final String hashes = yacySearch.set2string(queryhashes);
+        return retrieveFromCache(hashes, url.hash()) != null;
+    }
+
+    public static void storeToCache(final String wordhashes, final String urlhash, final String snippet) {
+        // generate key
+        String key = urlhash + wordhashes;
+
+        // do nothing if snippet is known
+        if (snippetsCache.containsKey(key)) return;
+
+        // learn new snippet
+        snippetsCache.put(key, snippet);
+    }
+
+    public static String retrieveFromCache(final String wordhashes, final String urlhash) {
+        // generate key
+        final String key = urlhash + wordhashes;
+        return snippetsCache.get(key);
+    }
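
storeToCache and retrieveFromCache key the ARC cache by plain string concatenation: the URL hash first, then the concatenated query word hashes, so one URL can carry a different snippet per query. A rough stand-in for SimpleARC using a LinkedHashMap-based LRU illustrates the keying scheme (SimpleARC's adaptive replacement policy is not reproduced; class and method names here are hypothetical):

    import java.util.LinkedHashMap;
    import java.util.Map;

    // Illustrative stand-in for YaCy's SimpleARC: a bounded, access-ordered map.
    // The point here is the composite key, not the eviction policy.
    public class SnippetCacheSketch {
        private static final int MAX_CACHE = 1000;

        private static final Map<String, String> cache =
            new LinkedHashMap<String, String>(MAX_CACHE, 0.75f, true) {
                @Override
                protected boolean removeEldestEntry(final Map.Entry<String, String> eldest) {
                    return size() > MAX_CACHE; // drop least recently used entry
                }
            };

        public static void store(final String wordhashes, final String urlhash, final String snippet) {
            final String key = urlhash + wordhashes; // url hash first, then the query word hashes
            if (!cache.containsKey(key)) cache.put(key, snippet);
        }

        public static String retrieve(final String wordhashes, final String urlhash) {
            return cache.get(urlhash + wordhashes);
        }

        public static void main(final String[] args) {
            store("AAAAAAAAAAAA", "BBBBBBBBBBBB", "a snippet line");
            System.out.println(retrieve("AAAAAAAAAAAA", "BBBBBBBBBBBB")); // -> a snippet line
        }
    }
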
"" : error.trim(); + } + public int getErrorCode() { + return errorCode; + } + public TreeSet getRemainingHashes() { + return this.remaingHashes; + } + public String getLineMarked(final TreeSet queryHashes) { + if (line == null) return ""; + if ((queryHashes == null) || (queryHashes.size() == 0)) return line.trim(); + if (line.endsWith(".")) line = line.substring(0, line.length() - 1); + final Iterator i = queryHashes.iterator(); + byte[] h; + final String[] w = line.split(" "); + while (i.hasNext()) { + h = i.next(); + for (int j = 0; j < w.length; j++) { + final ArrayList al = markedWordArrayList(w[j]); // mark special character separated words correctly if more than 1 word has to be marked + w[j] = ""; + for (int k = 0; k < al.size(); k++) { + if(k % 2 == 0){ // word has not been marked + w[j] += getWordMarked(al.get(k), h); + } else { // word has been marked, do not encode again + w[j] += al.get(k); + } + } + } + } + final StringBuilder l = new StringBuilder(line.length() + queryHashes.size() * 8); + for (int j = 0; j < w.length; j++) { + l.append(w[j]); + l.append(' '); + } + return l.toString().trim(); + } + + /** + * mark words with <b>-tags + * @param word the word to mark + * @param h the hash of the word to mark + * @return the marked word if hash matches, else the unmarked word + * @see #getLineMarked(Set) + */ + private static String getWordMarked(String word, byte[] h){ + //ignore punctuation marks (contrib [MN]) + //note to myself: + //For details on regex see "Mastering regular expressions" by J.E.F. Friedl + //especially p. 123 and p. 390/391 (in the German version of the 2nd edition) + + String prefix = ""; + String postfix = ""; + int len = 0; + + // cut off prefix if it contains of non-characters or non-numbers + while(p1.matcher(word).find()) { + prefix = prefix + word.substring(0,1); + word = word.substring(1); + } + + // cut off postfix if it contains of non-characters or non-numbers + while(p2.matcher(word).find()) { + len = word.length(); + postfix = word.substring(len-1,len) + postfix; + word = word.substring(0,len-1); + } + + //special treatment if there is a special character in the word + if(p3.matcher(word).find()) { + String out = ""; + String temp = ""; + for(int k=0; k < word.length(); k++) { + //is character a special character? 
+
+    /**
+     * mark words with <b>-tags
+     * @param word the word to mark
+     * @param h the hash of the word to mark
+     * @return the marked word if the hash matches, else the unmarked word
+     * @see #getLineMarked(Set)
+     */
+    private static String getWordMarked(String word, byte[] h) {
+        // ignore punctuation marks (contrib [MN])
+        // note to myself:
+        // For details on regex see "Mastering regular expressions" by J.E.F. Friedl,
+        // especially p. 123 and p. 390/391 (in the German version of the 2nd edition)
+
+        String prefix = "";
+        String postfix = "";
+        int len = 0;
+
+        // cut off the prefix if it consists of non-letter and non-number characters
+        while (p1.matcher(word).find()) {
+            prefix = prefix + word.substring(0, 1);
+            word = word.substring(1);
+        }
+
+        // cut off the postfix if it consists of non-letter and non-number characters
+        while (p2.matcher(word).find()) {
+            len = word.length();
+            postfix = word.substring(len - 1, len) + postfix;
+            word = word.substring(0, len - 1);
+        }
+
+        // special treatment if there is a special character in the word
+        if (p3.matcher(word).find()) {
+            String out = "";
+            String temp = "";
+            for (int k = 0; k < word.length(); k++) {
+                // is the character a special character?
+                if (p4.matcher(word.substring(k, k + 1)).find()) {
+                    if (new String(Word.word2hash(temp)).equals(new String(h))) temp = "<b>" + CharacterCoding.unicode2html(temp, false) + "</b>";
+                    out = out + temp + CharacterCoding.unicode2html(word.substring(k, k + 1), false);
+                    temp = "";
+                }
+                // last character
+                else if (k == (word.length() - 1)) {
+                    temp = temp + word.substring(k, k + 1);
+                    if (new String(Word.word2hash(temp)).equals(new String(h))) temp = "<b>" + CharacterCoding.unicode2html(temp, false) + "</b>";
+                    out = out + temp;
+                    temp = "";
+                }
+                else temp = temp + word.substring(k, k + 1);
+            }
+            word = out;
+        }
+        // end contrib [MN]
+        else if (new String(Word.word2hash(word)).equals(new String(h))) word = "<b>" + CharacterCoding.unicode2html(word, false) + "</b>";
+
+        word = CharacterCoding.unicode2html(prefix, false)
+             + word
+             + CharacterCoding.unicode2html(postfix, false);
+        return word;
+    }
+
+    /**
+     * words that have already been marked have index (i % 2 == 1)
+     * words that have not yet been marked have index (i % 2 == 0)
+     * @param string the String to be processed
+     * @return words that have and have not yet been marked
+     * @author [DW], 08.11.2008
+     */
+    private static ArrayList<String> markedWordArrayList(String string) {
+        ArrayList<String> al = new java.util.ArrayList<String>(1);
+        Matcher m = p01.matcher(string);
+        while (m.find()) {
+            al.add(m.group(1));
+            al.add(m.group(2));
+            string = m.group(3); // the postfix
+            m = p01.matcher(string);
+        }
+        al.add(string);
+        return al;
+    }
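
markedWordArrayList's alternation contract (even indexes unmarked, odd indexes already marked) can be exercised in isolation; this sketch uses the same p01 pattern as reconstructed above (the class name is hypothetical):

    import java.util.ArrayList;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // Sketch of markedWordArrayList's contract: split a partially marked token
    // into alternating unmarked (even index) and marked (odd index) segments,
    // so the caller only re-encodes the unmarked parts.
    public class MarkedSplitSketch {
        private static final Pattern MARKED = Pattern.compile("(.*?)(\\<b\\>.+?\\</b\\>)(.*)");

        public static ArrayList<String> split(String s) {
            final ArrayList<String> parts = new ArrayList<String>();
            Matcher m = MARKED.matcher(s);
            while (m.find()) {
                parts.add(m.group(1)); // unmarked prefix (may be empty)
                parts.add(m.group(2)); // the <b>...</b> segment
                s = m.group(3);        // keep scanning the remainder
                m = MARKED.matcher(s);
            }
            parts.add(s);              // trailing unmarked remainder
            return parts;
        }

        public static void main(final String[] args) {
            System.out.println(split("foo<b>bar</b>baz")); // -> [foo, <b>bar</b>, baz]
        }
    }
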
+
+    @SuppressWarnings("unchecked")
+    public static TextSnippet retrieveTextSnippet(final URLMetadataRow.Components comp, final TreeSet<byte[]> queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) {
+        // heise = "0OQUNU3JSs05"
+        final yacyURL url = comp.url();
+        if (queryhashes.size() == 0) {
+            //System.out.println("found no queryhashes for URL retrieve " + url);
+            return new TextSnippet(url, null, ERROR_NO_HASH_GIVEN, queryhashes, "no query hashes given");
+        }
+
+        // try to get snippet from snippetCache
+        int source = SOURCE_CACHE;
+        final String wordhashes = yacySearch.set2string(queryhashes);
+        String line = retrieveFromCache(wordhashes, url.hash());
+        if (line != null) {
+            // found the snippet
+            return new TextSnippet(url, line, source, null, null, faviconCache.get(url.hash()));
+        }
+
+        /* ===========================================================================
+         * LOADING RESOURCE DATA
+         * =========================================================================== */
+        // if the snippet is not in the cache, we can try to get it from the htcache
+        long resContentLength = 0;
+        InputStream resContent = null;
+        ResponseHeader responseHeader = null;
+        try {
+            // first try to get the snippet from metadata
+            String loc;
+            if (containsAllHashes(loc = comp.dc_title(), queryhashes)) {
+                // try to create the snippet from information given in the title metadata
+                return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
+            } else if (containsAllHashes(loc = comp.dc_creator(), queryhashes)) {
+                // try to create the snippet from information given in the creator metadata
+                return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
+            } else if (containsAllHashes(loc = comp.dc_subject(), queryhashes)) {
+                // try to create the snippet from information given in the subject metadata
+                return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
+            } else if (containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
+                // try to create the snippet from the words in the url itself
+                return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
+            } else {
+                // trying to load the resource from the cache
+                resContent = Cache.getContentStream(url);
+                responseHeader = Cache.getResponseHeader(url);
+                if (resContent != null && ((resContentLength = Cache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
+                    // content may be too large to be parsed here. To be fast, we omit calculation of snippet here
+                    return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
+                } else if (fetchOnline) {
+                    // if not found try to download it
+
+                    // download resource using the crawler and keep resource in memory if possible
+                    final Response entry = Switchboard.getSwitchboard().loader.load(url, true, reindexing);
+
+                    // getting resource metadata (e.g. the http headers for http resources)
+                    if (entry != null) {
+                        // place entry on indexing queue
+                        sb.toIndexer(entry);
+
+                        // read resource body (if it is there)
+                        final byte[] resourceArray = entry.getContent();
+                        if (resourceArray != null) {
+                            resContent = new ByteArrayInputStream(resourceArray);
+                            resContentLength = resourceArray.length;
+                        } else {
+                            resContent = Cache.getContentStream(url);
+                            resContentLength = Cache.getResourceContentLength(url);
+                        }
+                    }
+
+                    // if it is still not available, report an error
+                    if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource, plasmaHTCache.Entry cache is NULL");
+
+                    source = SOURCE_WEB;
+                } else {
+                    return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "no resource available");
+                }
+            }
+        } catch (final Exception e) {
+            //e.printStackTrace();
+            return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "error loading resource: " + e.getMessage());
+        }
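
Before any resource loading happens, retrieveTextSnippet short-circuits on metadata: if the title, creator, subject, or the URL string itself already contains every query hash, that field is returned directly as a SOURCE_METADATA snippet and nothing is loaded or parsed. A sketch of that cascade, with plain lowercased words standing in for YaCy's word hashes (all names here are illustrative):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    // Sketch of the metadata short-circuit: the first field that contains every
    // query word becomes the snippet, and the expensive load/parse path is skipped.
    public class MetadataShortcutSketch {
        static boolean containsAllWords(final String sentence, final Set<String> query) {
            final Set<String> words = new HashSet<String>(
                Arrays.asList(sentence.toLowerCase().split("[^\\p{L}\\p{N}]+")));
            return words.containsAll(query);
        }

        static String metadataSnippet(final List<String> fields, final Set<String> query) {
            for (final String field : fields) {
                if (containsAllWords(field, query)) return field; // SOURCE_METADATA case
            }
            return null; // fall through to cache/web loading
        }

        public static void main(final String[] args) {
            final Set<String> query = new HashSet<String>(Arrays.asList("snippet", "cache"));
            System.out.println(metadataSnippet(
                Arrays.asList("refactoring of the snippet cache", "orbiter"), query));
            // -> refactoring of the snippet cache
        }
    }
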
+
+        /* ===========================================================================
+         * PARSING RESOURCE
+         * =========================================================================== */
+        Document document = null;
+        try {
+            document = Document.parseDocument(url, resContentLength, resContent, responseHeader);
+        } catch (final ParserException e) {
+            return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
+        } finally {
+            try { resContent.close(); } catch (final Exception e) {/* ignore this */}
+        }
+        if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed
+
+        /* ===========================================================================
+         * COMPUTE SNIPPET
+         * =========================================================================== */
+        final yacyURL resFavicon = document.getFavicon();
+        if (resFavicon != null) faviconCache.put(url.hash(), resFavicon);
+        // we have found a parseable non-empty file: use the lines
+
+        // compute snippet from text
+        final Iterator<StringBuilder> sentences = document.getSentences(pre);
+        if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences", resFavicon);
+        final Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
+        final String textline = (tsr == null) ? null : (String) tsr[0];
+        final TreeSet<byte[]> remainingHashes = (tsr == null) ? queryhashes : (TreeSet<byte[]>) tsr[1];
+
+        // compute snippet from media
+        //String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+        //String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
+        //String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
+        //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
+        //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+
+        line = "";
+        //if (audioline != null) line += (line.length() == 0) ? audioline : "<br>" + audioline;
+        //if (videoline != null) line += (line.length() == 0) ? videoline : "<br>" + videoline;
+        //if (appline != null) line += (line.length() == 0) ? appline : "<br>" + appline;
+        //if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br>" + hrefline;
+        if (textline != null) line += (line.length() == 0) ? textline : "<br>" + textline;
+
+        if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found", resFavicon);
+        if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
+
+        // finally store this snippet in our own cache
+        storeToCache(wordhashes, url.hash(), line);
+
+        document.close();
+        return new TextSnippet(url, line, source, null, null, resFavicon);
+    }
+
+    private static boolean containsAllHashes(final String sentence, final Set<byte[]> queryhashes) {
+        final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence);
+        final Iterator<byte[]> i = queryhashes.iterator();
+        while (i.hasNext()) {
+            if (!(m.containsKey(i.next()))) return false;
+        }
+        return true;
+    }
+
+    @SuppressWarnings("unchecked")
+    private static Object[] /*{String - the snippet, Set - remaining hashes}*/
+            computeTextSnippet(final Iterator<StringBuilder> sentences, final TreeSet<byte[]> queryhashes, int maxLength) {
+        try {
+            if (sentences == null) return null;
+            if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
+            Iterator<byte[]> j;
+            TreeMap<byte[], Integer> hs;
+            StringBuilder sentence;
+            final TreeMap<Integer, StringBuilder> os = new TreeMap<Integer, StringBuilder>();
+            int uniqCounter = 9999;
+            int score;
+            while (sentences.hasNext()) {
+                sentence = sentences.next();
+                hs = Condenser.hashSentence(sentence.toString());
+                j = queryhashes.iterator();
+                score = 0;
+                while (j.hasNext()) { if (hs.containsKey(j.next())) score++; }
+                if (score > 0) {
+                    os.put(Integer.valueOf(1000000 * score - sentence.length() * 10000 + uniqCounter--), sentence);
+                }
+            }
+
+            String result;
+            TreeSet<byte[]> remaininghashes;
+            while (os.size() > 0) {
+                sentence = os.remove(os.lastKey()); // sentence with the biggest score
+                Object[] tsr = computeTextSnippet(sentence.toString(), queryhashes, maxLength);
+                if (tsr == null) continue;
+                result = (String) tsr[0];
+                if ((result != null) && (result.length() > 0)) {
+                    remaininghashes = (TreeSet<byte[]>) tsr[1];
+                    if (remaininghashes.size() == 0) {
+                        // we have found the snippet
+                        return new Object[]{result, remaininghashes};
+                    } else if (remaininghashes.size() < queryhashes.size()) {
+                        // the result does not have all words in it yet;
+                        // find another sentence that contributes the missing words
+                        // and recurse for more sentences
+                        maxLength = maxLength - result.length();
+                        if (maxLength < 20) maxLength = 20;
+                        tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength);
+                        if (tsr == null) return null;
+                        final String nextSnippet = (String) tsr[0];
+                        if (nextSnippet == null) return tsr;
+                        return new Object[]{result + (" / " + nextSnippet), tsr[1]};
+                    } else {
+                        // error
+                        //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
+                        continue;
+                    }
+                }
+            }
+            return null;
+        } catch (final IndexOutOfBoundsException e) {
+            log.logSevere("computeSnippet: error with string generation", e);
+            return new Object[]{null, queryhashes};
+        }
+    }
+
+    private static Object[] /*{String - the snippet, Set - remaining hashes}*/
+            computeTextSnippet(String sentence, final TreeSet<byte[]> queryhashes, final int maxLength) {
+        try {
+            if (sentence == null) return null;
+            if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
+            byte[] hash;
+
+            // find all hashes that appear in the sentence
+            final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
+            final Iterator<byte[]> j = queryhashes.iterator();
+            Integer pos;
+            int p, minpos = sentence.length(), maxpos = -1;
+            final TreeSet<byte[]> remainingHashes = new TreeSet<byte[]>(Base64Order.enhancedCoder);
+            while (j.hasNext()) {
+                hash = j.next();
+                pos = hs.get(hash);
+                if (pos == null) {
+                    remainingHashes.add(hash);
+                } else {
+                    p = pos.intValue();
+                    if (p > maxpos) maxpos = p;
+                    if (p < minpos) minpos = p;
+                }
+            }
+            // check result size
+            maxpos = maxpos + 10;
+            if (maxpos > sentence.length()) maxpos = sentence.length();
+            if (minpos < 0) minpos = 0;
+            // we have a result, but is it short enough?
+            if (maxpos - minpos + 10 > maxLength) {
+                // the string is too long, even if we cut at both ends
+                // so cut here in the middle of the string
+                final int lenb = sentence.length();
+                sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
+                           " [..] " +
+                           sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
+                maxpos = maxpos + lenb - sentence.length() + 6;
+            }
+            if (maxpos > maxLength) {
+                // the string is too long, even if we cut it at the end
+                // so cut it here at both ends at once
+                assert maxpos >= minpos;
+                final int newlen = Math.max(10, maxpos - minpos + 10);
+                final int around = (maxLength - newlen) / 2;
+                assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
+                //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
+                sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
+                minpos = around;
+                maxpos = sentence.length() - around - 5;
+            }
+            if (sentence.length() > maxLength) {
+                // trim sentence, 1st step (cut at right side)
+                sentence = sentence.substring(0, maxpos).trim() + " [..]";
+            }
+            if (sentence.length() > maxLength) {
+                // trim sentence, 2nd step (cut at left side)
+                sentence = "[..] " + sentence.substring(minpos).trim();
+            }
+            if (sentence.length() > maxLength) {
+                // trim sentence, 3rd step (cut in the middle)
+                sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
+            }
+            return new Object[] {sentence, remainingHashes};
+        } catch (final IndexOutOfBoundsException e) {
+            log.logSevere("computeSnippet: error with string generation", e);
+            return null;
+        }
+    }
+
+    public static String failConsequences(final TextSnippet snippet, final String eventID) throws IOException {
+        // problems with snippet fetch
+        final String urlHash = snippet.getUrl().hash();
+        final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' ');
+        if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
+            (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
+            (snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
+            (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
+            log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
+            Switchboard.getSwitchboard().indexSegment.urlMetadata().remove(urlHash);
+            final SearchEvent event = SearchEventCache.getEvent(eventID);
+            assert Switchboard.getSwitchboard() != null;
+            assert Switchboard.getSwitchboard().indexSegment != null;
+            assert event != null : "eventID = " + eventID;
+            assert event.getQuery() != null;
+            Switchboard.getSwitchboard().indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash);
+            event.remove(urlHash);
+        }
+        if (snippet.getErrorCode() == ERROR_NO_MATCH) {
+            log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
+            Switchboard.getSwitchboard().indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash);
+            SearchEventCache.getEvent(eventID).remove(urlHash);
+        }
+        return snippet.getError();
+    }
+
+}
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 5675f157e..1d4d0d0d1 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -84,9 +84,9 @@ import de.anomic.kelondro.util.ByteBuffer;
 import de.anomic.kelondro.util.FileUtils;
 import de.anomic.search.RankingProfile;
 import de.anomic.search.RankingProcess;
-import de.anomic.search.SnippetCache;
 import de.anomic.search.Switchboard;
 import de.anomic.search.SwitchboardConstants;
+import de.anomic.search.TextSnippet;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverDomains;
 import de.anomic.tools.crypt;
@@ -587,7 +587,7 @@ public final class yacyClient {
                 // because they are search-specific.
                 // instead, they are placed in a snippet-search cache.
                 // System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
-                SnippetCache.storeToCache(wordhashes, urlEntry.hash(), urlEntry.snippet());
+                TextSnippet.storeToCache(wordhashes, urlEntry.hash(), urlEntry.snippet());
             }
             // add the url entry to the word indexes