From e024e3b9cf85d388c4875af38063e0d99427d3e4 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 24 Apr 2008 08:42:08 +0000 Subject: [PATCH] added new default profiles to distinguish snippet fetch for local and global search the difference is, that a local search will not cause a re-indexing of loaded pages git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4731 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 2 +- htroot/IndexCreateWWWLocalQueue_p.java | 6 ++- htroot/ViewFile.java | 2 +- htroot/ViewImage.java | 2 +- htroot/WatchWebStructure_p.java | 6 ++- htroot/yacy/ui/ymarks.java | 2 +- htroot/yacy/user/sidebar_history.java | 2 +- htroot/yacy/user/sidebar_navigation.java | 6 +-- htroot/yacysearch.java | 4 +- htroot/yacysearchitem.java | 2 +- .../plasma/crawler/plasmaCrawlQueues.java | 12 ++++- source/de/anomic/plasma/plasmaParser.java | 2 +- .../de/anomic/plasma/plasmaSearchEvent.java | 4 +- .../de/anomic/plasma/plasmaSearchImages.java | 6 +-- .../de/anomic/plasma/plasmaSearchQuery.java | 8 ++- .../de/anomic/plasma/plasmaSnippetCache.java | 17 +++--- .../de/anomic/plasma/plasmaSwitchboard.java | 54 ++++++++++++------- source/de/anomic/yacy/yacyVersion.java | 1 - source/de/anomic/ymage/ymageOSM.java | 2 +- 19 files changed, 87 insertions(+), 53 deletions(-) diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index c47311148..43fda6230 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -200,7 +200,7 @@ public class Bookmarks { plasmaParserDocument document = null; if (urlentry != null) { indexURLReference.Components comp = urlentry.comp(); - document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true); + document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true, false); prop.put("mode_edit", "0"); // create mode prop.put("mode_url", comp.url().toNormalform(false, true)); prop.putHTML("mode_title", comp.dc_title()); diff --git a/htroot/IndexCreateWWWLocalQueue_p.java 
b/htroot/IndexCreateWWWLocalQueue_p.java index d4776d0f7..fad668b6d 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -115,8 +115,10 @@ public class IndexCreateWWWLocalQueue_p { final String name = entry.name(); if (name.equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) || name.equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) || - name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) || - name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)) + name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || + name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || + name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || + name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) continue; if (compiledPattern.matcher(name).find()) { sb.profilesActiveCrawls.removeEntry(entry.handle()); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index e631fae51..9bf0d3fb5 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -166,7 +166,7 @@ public class ViewFile { if (resource == null) { plasmaHTCache.Entry entry = null; try { - entry = sb.crawlQueues.loadResourceFromWeb(url, 5000, false, true); + entry = sb.crawlQueues.loadResourceFromWeb(url, 5000, false, true, false); } catch (Exception e) { prop.put("error", "4"); prop.putHTML("error_errorText", e.getMessage()); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index 53b4c3291..d1e6c31c7 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -98,7 +98,7 @@ public class ViewImage { // getting the image as stream Image scaled = iconcache.get(urlString); if (scaled == null) { - Object[] resource = plasmaSnippetCache.getResource(url, true, timeout, false); + Object[] resource = plasmaSnippetCache.getResource(url, true, timeout, false, true); byte[] imgb = null; if (resource == null) { if (urlString.endsWith(".ico")) { diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java index 
5dcb8488e..0ea06ff5f 100644 --- a/htroot/WatchWebStructure_p.java +++ b/htroot/WatchWebStructure_p.java @@ -37,8 +37,10 @@ public class WatchWebStructure_p { e = it.next(); if (e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) || e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) || - e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) || - e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)) + e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || + e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || + e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || + e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) continue; host = e.name(); break; // take the first one diff --git a/htroot/yacy/ui/ymarks.java b/htroot/yacy/ui/ymarks.java index 7da366f82..67d899572 100644 --- a/htroot/yacy/ui/ymarks.java +++ b/htroot/yacy/ui/ymarks.java @@ -200,7 +200,7 @@ public class ymarks { plasmaParserDocument document = null; if (urlentry != null) { indexURLReference.Components comp = urlentry.comp(); - document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true); + document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true, true); prop.put("mode_edit", "0"); // create mode prop.put("mode_url", comp.url().toNormalform(false, true)); prop.putHTML("mode_title", comp.dc_title()); diff --git a/htroot/yacy/user/sidebar_history.java b/htroot/yacy/user/sidebar_history.java index 325dd09da..8c9b1312d 100644 --- a/htroot/yacy/user/sidebar_history.java +++ b/htroot/yacy/user/sidebar_history.java @@ -53,7 +53,7 @@ public class sidebar_history { if (visibleQueries.contains(query.queryString)) continue; // avoid doubles visibleQueries.add(query.queryString); prop.put("history_list_" + c + "_querystring", query.queryString); - prop.put("history_list_" + c + "_searchdom", query.searchdom()); + prop.put("history_list_" + c + "_searchdom", ((query.isLocal()) ? 
"local" : "global")); prop.put("history_list_" + c + "_contentdom", query.contentdom()); c++; if (c >= 10) break; diff --git a/htroot/yacy/user/sidebar_navigation.java b/htroot/yacy/user/sidebar_navigation.java index 8a9da0a5e..3ede24b0e 100644 --- a/htroot/yacy/user/sidebar_navigation.java +++ b/htroot/yacy/user/sidebar_navigation.java @@ -117,7 +117,7 @@ public class sidebar_navigation { prop.put("navigation_topwords_words_" + hintcount + "_count", theQuery.displayResults()); prop.put("navigation_topwords_words_" + hintcount + "_offset", "0"); prop.put("navigation_topwords_words_" + hintcount + "_contentdom", theQuery.contentdom()); - prop.put("navigation_topwords_words_" + hintcount + "_resource", theQuery.searchdom()); + prop.put("navigation_topwords_words_" + hintcount + "_resource", ((theQuery.isLocal()) ? "local" : "global")); prop.put("navigation_topwords_words_" + hintcount + "_zonecode", theQuery.zonecode); } hintcount++; @@ -182,7 +182,7 @@ public class sidebar_navigation { " map = new HashMap(); @@ -417,7 +417,7 @@ public class yacysearch { "&search=" + theQuery.queryString() + "&count="+ theQuery.displayResults() + "&offset=" + (page * theQuery.displayResults()) + - "&resource=" + theQuery.searchdom() + + "&resource=" + ((theQuery.isLocal()) ? "local" : "global") + "&urlmaskfilter=" + theQuery.urlMask + "&prefermaskfilter=" + theQuery.prefer + "&cat=href&constraint=" + ((theQuery.constraint == null) ? 
"" : theQuery.constraint.exportB64()) + diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 4540873b5..a3aee77e1 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -155,7 +155,7 @@ public class yacysearchitem { prop.put("references_words_" + hintcount + "_count", theQuery.displayResults()); prop.put("references_words_" + hintcount + "_offset", "0"); prop.put("references_words_" + hintcount + "_contentdom", theQuery.contentdom()); - prop.put("references_words_" + hintcount + "_resource", theQuery.searchdom()); + prop.put("references_words_" + hintcount + "_resource", ((theQuery.isLocal()) ? "local" : "global")); } prop.put("references_words", hintcount); if (hintcount++ > MAX_TOPWORDS) { diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java index 4fb1f8554..a330915ec 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java @@ -436,7 +436,8 @@ public class plasmaCrawlQueues { yacyURL url, int socketTimeout, boolean keepInMemory, - boolean forText + boolean forText, + boolean global ) { plasmaCrawlEntry centry = new plasmaCrawlEntry( @@ -445,7 +446,14 @@ public class plasmaCrawlQueues { null, "", new Date(), - (forText) ? sb.defaultTextSnippetProfile.handle() : sb.defaultMediaSnippetProfile.handle(), // crawl profile + (forText) ? + ((global) ? + sb.defaultTextSnippetGlobalProfile.handle() : + sb.defaultTextSnippetLocalProfile.handle()) + : + ((global) ? 
+ sb.defaultMediaSnippetGlobalProfile.handle() : + sb.defaultMediaSnippetLocalProfile.handle()), // crawl profile 0, 0, 0); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index fdd7de853..6dd83b73e 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -667,7 +667,7 @@ public final class plasmaParser { } if (!documentCharset.equalsIgnoreCase(charset)) { - this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "'"); + this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true)); } // parsing the content diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 47b69e865..272e8ab18 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -350,7 +350,7 @@ public final class plasmaSearchEvent { if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) { // attach text snippet startTime = System.currentTimeMillis(); - plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000); + plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? 
Integer.MAX_VALUE : 100000, query.isGlobal()); long snippetComputationTime = System.currentTimeMillis() - startTime; serverLog.logInfo("SEARCH_EVENT", "text snippet load time for " + comp.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")"))); @@ -370,7 +370,7 @@ public final class plasmaSearchEvent { } else { // attach media information startTime = System.currentTimeMillis(); - ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000); + ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal()); long snippetComputationTime = System.currentTimeMillis() - startTime; serverLog.logInfo("SEARCH_EVENT", "media snippet load time for " + comp.url() + ": " + snippetComputationTime); diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index aabaf4cc8..c3e1fcb11 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -56,11 +56,11 @@ public final class plasmaSearchImages { private HashMap images; - public plasmaSearchImages(long maxTime, yacyURL url, int depth) { + public plasmaSearchImages(long maxTime, yacyURL url, int depth, boolean indexing) { long start = System.currentTimeMillis(); this.images = new HashMap(); if (maxTime > 10) { - Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false); + Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false, indexing); InputStream res = (InputStream) resource[0]; Long resLength = (Long) resource[1]; if (res != null) { @@ -85,7 +85,7 @@ public final class plasmaSearchImages { while (i.hasNext()) { try { nexturlstring = i.next().toNormalform(true, true); - 
addAll(new plasmaSearchImages(serverDate.remainingTime(start, maxTime, 10), new yacyURL(nexturlstring, null), depth - 1)); + addAll(new plasmaSearchImages(serverDate.remainingTime(start, maxTime, 10), new yacyURL(nexturlstring, null), depth - 1, indexing)); } catch (MalformedURLException e1) { e1.printStackTrace(); } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index dac5ff0df..4f1c74aea 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -193,8 +193,12 @@ public final class plasmaSearchQuery { return "text"; } - public String searchdom() { - return (this.domType == SEARCHDOM_LOCAL) ? "local" : "global"; + public boolean isGlobal() { + return this.domType != SEARCHDOM_LOCAL; + } + + public boolean isLocal() { + return this.domType == SEARCHDOM_LOCAL; } public static TreeSet hashes2Set(String query) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 9a7e791ee..7db01e9ce 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -255,7 +255,7 @@ public class plasmaSnippetCache { } @SuppressWarnings("unchecked") - public static TextSnippet retrieveTextSnippet(indexURLReference.Components comp, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) { + public static TextSnippet retrieveTextSnippet(indexURLReference.Components comp, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen, boolean reindexing) { // heise = "0OQUNU3JSs05" yacyURL url = comp.url(); if (queryhashes.size() == 0) { @@ -305,7 +305,7 @@ public class plasmaSnippetCache { // if not found try to download it // download resource using the crawler and keep resource in memory if possible - plasmaHTCache.Entry entry = 
plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, true); + plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, true, reindexing); // place entry on crawl queue plasmaHTCache.push(entry); @@ -398,9 +398,10 @@ public class plasmaSnippetCache { * @param fetchOnline specifies if the resource should be loaded from web if it'as not available in the cache * @param timeout * @param forText + * @param global the domain of the search. If global == true then the content is re-indexed * @return the parsed document as {@link plasmaParserDocument} */ - public static plasmaParserDocument retrieveDocument(yacyURL url, boolean fetchOnline, int timeout, boolean forText) { + public static plasmaParserDocument retrieveDocument(yacyURL url, boolean fetchOnline, int timeout, boolean forText, boolean global) { // load resource long resContentLength = 0; @@ -416,7 +417,7 @@ public class plasmaSnippetCache { // if not found try to download it // download resource using the crawler and keep resource in memory if possible - plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, forText); + plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, forText, global); // getting resource metadata (e.g. 
the http headers for http resources) if (entry != null) { @@ -648,13 +649,13 @@ public class plasmaSnippetCache { } } - public static ArrayList retrieveMediaSnippets(yacyURL url, Set queryhashes, int mediatype, boolean fetchOnline, int timeout) { + public static ArrayList retrieveMediaSnippets(yacyURL url, Set queryhashes, int mediatype, boolean fetchOnline, int timeout, boolean reindexing) { if (queryhashes.size() == 0) { serverLog.logFine("snippet fetch", "no query hashes given for url " + url); return new ArrayList(); } - plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout, false); + plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout, false, reindexing); ArrayList a = new ArrayList(); if (document != null) { if ((mediatype == plasmaSearchQuery.CONTENTDOM_ALL) || (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, plasmaSearchQuery.CONTENTDOM_AUDIO)); @@ -860,7 +861,7 @@ public class plasmaSnippetCache { * [1]the content-length as {@link Integer} * */ - public static Object[] getResource(yacyURL url, boolean fetchOnline, int socketTimeout, boolean forText) { + public static Object[] getResource(yacyURL url, boolean fetchOnline, int socketTimeout, boolean forText, boolean reindexing) { // load the url as resource from the web long contentLength = -1; @@ -872,7 +873,7 @@ public class plasmaSnippetCache { // if the content is not available in cache try to download it from web // try to download the resource using a crawler - plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true, forText); + plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, (socketTimeout < 0) ? 
-1 : socketTimeout, true, forText, reindexing); if (entry == null) return null; // not found in web // read resource body (if it is there) diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 96d7887b5..3fb42e891 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -212,8 +212,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"

@@ -1511,8 +1513,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch i = this.profilesActiveCrawls.profiles(true); plasmaCrawlProfile.entry profile; String name; @@ -1521,8 +1525,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch