From e024e3b9cf85d388c4875af38063e0d99427d3e4 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 24 Apr 2008 08:42:08 +0000
Subject: [PATCH] added new default profiles to distinguish snippet fetch for
local and global search. The difference is that a local search will not
cause a re-indexing of loaded pages
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4731 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/Bookmarks.java | 2 +-
htroot/IndexCreateWWWLocalQueue_p.java | 6 ++-
htroot/ViewFile.java | 2 +-
htroot/ViewImage.java | 2 +-
htroot/WatchWebStructure_p.java | 6 ++-
htroot/yacy/ui/ymarks.java | 2 +-
htroot/yacy/user/sidebar_history.java | 2 +-
htroot/yacy/user/sidebar_navigation.java | 6 +--
htroot/yacysearch.java | 4 +-
htroot/yacysearchitem.java | 2 +-
.../plasma/crawler/plasmaCrawlQueues.java | 12 ++++-
source/de/anomic/plasma/plasmaParser.java | 2 +-
.../de/anomic/plasma/plasmaSearchEvent.java | 4 +-
.../de/anomic/plasma/plasmaSearchImages.java | 6 +--
.../de/anomic/plasma/plasmaSearchQuery.java | 8 ++-
.../de/anomic/plasma/plasmaSnippetCache.java | 17 +++---
.../de/anomic/plasma/plasmaSwitchboard.java | 54 ++++++++++++-------
source/de/anomic/yacy/yacyVersion.java | 1 -
source/de/anomic/ymage/ymageOSM.java | 2 +-
19 files changed, 87 insertions(+), 53 deletions(-)
diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index c47311148..43fda6230 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -200,7 +200,7 @@ public class Bookmarks {
plasmaParserDocument document = null;
if (urlentry != null) {
indexURLReference.Components comp = urlentry.comp();
- document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true);
+ document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true, false);
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", comp.url().toNormalform(false, true));
prop.putHTML("mode_title", comp.dc_title());
diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java
index d4776d0f7..fad668b6d 100644
--- a/htroot/IndexCreateWWWLocalQueue_p.java
+++ b/htroot/IndexCreateWWWLocalQueue_p.java
@@ -115,8 +115,10 @@ public class IndexCreateWWWLocalQueue_p {
final String name = entry.name();
if (name.equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
- name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
- name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
+ name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
+ name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
+ name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
+ name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
continue;
if (compiledPattern.matcher(name).find()) {
sb.profilesActiveCrawls.removeEntry(entry.handle());
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index e631fae51..9bf0d3fb5 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -166,7 +166,7 @@ public class ViewFile {
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
- entry = sb.crawlQueues.loadResourceFromWeb(url, 5000, false, true);
+ entry = sb.crawlQueues.loadResourceFromWeb(url, 5000, false, true, false);
} catch (Exception e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 53b4c3291..d1e6c31c7 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -98,7 +98,7 @@ public class ViewImage {
// getting the image as stream
Image scaled = iconcache.get(urlString);
if (scaled == null) {
- Object[] resource = plasmaSnippetCache.getResource(url, true, timeout, false);
+ Object[] resource = plasmaSnippetCache.getResource(url, true, timeout, false, true);
byte[] imgb = null;
if (resource == null) {
if (urlString.endsWith(".ico")) {
diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java
index 5dcb8488e..0ea06ff5f 100644
--- a/htroot/WatchWebStructure_p.java
+++ b/htroot/WatchWebStructure_p.java
@@ -37,8 +37,10 @@ public class WatchWebStructure_p {
e = it.next();
if (e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
- e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
- e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
+ e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
+ e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
+ e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
+ e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
continue;
host = e.name();
break; // take the first one
diff --git a/htroot/yacy/ui/ymarks.java b/htroot/yacy/ui/ymarks.java
index 7da366f82..67d899572 100644
--- a/htroot/yacy/ui/ymarks.java
+++ b/htroot/yacy/ui/ymarks.java
@@ -200,7 +200,7 @@ public class ymarks {
plasmaParserDocument document = null;
if (urlentry != null) {
indexURLReference.Components comp = urlentry.comp();
- document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true);
+ document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true, true);
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", comp.url().toNormalform(false, true));
prop.putHTML("mode_title", comp.dc_title());
diff --git a/htroot/yacy/user/sidebar_history.java b/htroot/yacy/user/sidebar_history.java
index 325dd09da..8c9b1312d 100644
--- a/htroot/yacy/user/sidebar_history.java
+++ b/htroot/yacy/user/sidebar_history.java
@@ -53,7 +53,7 @@ public class sidebar_history {
if (visibleQueries.contains(query.queryString)) continue; // avoid doubles
visibleQueries.add(query.queryString);
prop.put("history_list_" + c + "_querystring", query.queryString);
- prop.put("history_list_" + c + "_searchdom", query.searchdom());
+ prop.put("history_list_" + c + "_searchdom", ((query.isLocal()) ? "local" : "global"));
prop.put("history_list_" + c + "_contentdom", query.contentdom());
c++;
if (c >= 10) break;
diff --git a/htroot/yacy/user/sidebar_navigation.java b/htroot/yacy/user/sidebar_navigation.java
index 8a9da0a5e..3ede24b0e 100644
--- a/htroot/yacy/user/sidebar_navigation.java
+++ b/htroot/yacy/user/sidebar_navigation.java
@@ -117,7 +117,7 @@ public class sidebar_navigation {
prop.put("navigation_topwords_words_" + hintcount + "_count", theQuery.displayResults());
prop.put("navigation_topwords_words_" + hintcount + "_offset", "0");
prop.put("navigation_topwords_words_" + hintcount + "_contentdom", theQuery.contentdom());
- prop.put("navigation_topwords_words_" + hintcount + "_resource", theQuery.searchdom());
+ prop.put("navigation_topwords_words_" + hintcount + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
prop.put("navigation_topwords_words_" + hintcount + "_zonecode", theQuery.zonecode);
}
hintcount++;
@@ -182,7 +182,7 @@ public class sidebar_navigation {
" map = new HashMap();
@@ -417,7 +417,7 @@ public class yacysearch {
"&search=" + theQuery.queryString() +
"&count="+ theQuery.displayResults() +
"&offset=" + (page * theQuery.displayResults()) +
- "&resource=" + theQuery.searchdom() +
+ "&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
"&urlmaskfilter=" + theQuery.urlMask +
"&prefermaskfilter=" + theQuery.prefer +
"&cat=href&constraint=" + ((theQuery.constraint == null) ? "" : theQuery.constraint.exportB64()) +
diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java
index 4540873b5..a3aee77e1 100644
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@@ -155,7 +155,7 @@ public class yacysearchitem {
prop.put("references_words_" + hintcount + "_count", theQuery.displayResults());
prop.put("references_words_" + hintcount + "_offset", "0");
prop.put("references_words_" + hintcount + "_contentdom", theQuery.contentdom());
- prop.put("references_words_" + hintcount + "_resource", theQuery.searchdom());
+ prop.put("references_words_" + hintcount + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
}
prop.put("references_words", hintcount);
if (hintcount++ > MAX_TOPWORDS) {
diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
index 4fb1f8554..a330915ec 100644
--- a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
+++ b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
@@ -436,7 +436,8 @@ public class plasmaCrawlQueues {
yacyURL url,
int socketTimeout,
boolean keepInMemory,
- boolean forText
+ boolean forText,
+ boolean global
) {
plasmaCrawlEntry centry = new plasmaCrawlEntry(
@@ -445,7 +446,14 @@ public class plasmaCrawlQueues {
null,
"",
new Date(),
- (forText) ? sb.defaultTextSnippetProfile.handle() : sb.defaultMediaSnippetProfile.handle(), // crawl profile
+ (forText) ?
+ ((global) ?
+ sb.defaultTextSnippetGlobalProfile.handle() :
+ sb.defaultTextSnippetLocalProfile.handle())
+ :
+ ((global) ?
+ sb.defaultMediaSnippetGlobalProfile.handle() :
+ sb.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0,
0,
0);
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index fdd7de853..6dd83b73e 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -667,7 +667,7 @@ public final class plasmaParser {
}
if (!documentCharset.equalsIgnoreCase(charset)) {
- this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "'");
+ this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true));
}
// parsing the content
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 47b69e865..272e8ab18 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -350,7 +350,7 @@ public final class plasmaSearchEvent {
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
- plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
+ plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000, query.isGlobal());
long snippetComputationTime = System.currentTimeMillis() - startTime;
serverLog.logInfo("SEARCH_EVENT", "text snippet load time for " + comp.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
@@ -370,7 +370,7 @@ public final class plasmaSearchEvent {
} else {
// attach media information
startTime = System.currentTimeMillis();
- ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000);
+ ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal());
long snippetComputationTime = System.currentTimeMillis() - startTime;
serverLog.logInfo("SEARCH_EVENT", "media snippet load time for " + comp.url() + ": " + snippetComputationTime);
diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java
index aabaf4cc8..c3e1fcb11 100644
--- a/source/de/anomic/plasma/plasmaSearchImages.java
+++ b/source/de/anomic/plasma/plasmaSearchImages.java
@@ -56,11 +56,11 @@ public final class plasmaSearchImages {
private HashMap images;
- public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
+ public plasmaSearchImages(long maxTime, yacyURL url, int depth, boolean indexing) {
long start = System.currentTimeMillis();
this.images = new HashMap();
if (maxTime > 10) {
- Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false);
+ Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false, indexing);
InputStream res = (InputStream) resource[0];
Long resLength = (Long) resource[1];
if (res != null) {
@@ -85,7 +85,7 @@ public final class plasmaSearchImages {
while (i.hasNext()) {
try {
nexturlstring = i.next().toNormalform(true, true);
- addAll(new plasmaSearchImages(serverDate.remainingTime(start, maxTime, 10), new yacyURL(nexturlstring, null), depth - 1));
+ addAll(new plasmaSearchImages(serverDate.remainingTime(start, maxTime, 10), new yacyURL(nexturlstring, null), depth - 1, indexing));
} catch (MalformedURLException e1) {
e1.printStackTrace();
}
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index dac5ff0df..4f1c74aea 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -193,8 +193,12 @@ public final class plasmaSearchQuery {
return "text";
}
- public String searchdom() {
- return (this.domType == SEARCHDOM_LOCAL) ? "local" : "global";
+ public boolean isGlobal() {
+ return this.domType != SEARCHDOM_LOCAL; // every search domain other than SEARCHDOM_LOCAL is reported as global
+ }
+
+ public boolean isLocal() {
+ return this.domType == SEARCHDOM_LOCAL; // true only for the local search domain; was '!=' which inverted the "local"/"global" labels at call sites
}
public static TreeSet hashes2Set(String query) {
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 9a7e791ee..7db01e9ce 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -255,7 +255,7 @@ public class plasmaSnippetCache {
}
@SuppressWarnings("unchecked")
- public static TextSnippet retrieveTextSnippet(indexURLReference.Components comp, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
+ public static TextSnippet retrieveTextSnippet(indexURLReference.Components comp, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen, boolean reindexing) {
// heise = "0OQUNU3JSs05"
yacyURL url = comp.url();
if (queryhashes.size() == 0) {
@@ -305,7 +305,7 @@ public class plasmaSnippetCache {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
- plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, true);
+ plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, true, reindexing);
// place entry on crawl queue
plasmaHTCache.push(entry);
@@ -398,9 +398,10 @@ public class plasmaSnippetCache {
* @param fetchOnline specifies if the resource should be loaded from web if it'as not available in the cache
* @param timeout
* @param forText
+ * @param global the domain of the search. If global == true then the content is re-indexed
* @return the parsed document as {@link plasmaParserDocument}
*/
- public static plasmaParserDocument retrieveDocument(yacyURL url, boolean fetchOnline, int timeout, boolean forText) {
+ public static plasmaParserDocument retrieveDocument(yacyURL url, boolean fetchOnline, int timeout, boolean forText, boolean global) {
// load resource
long resContentLength = 0;
@@ -416,7 +417,7 @@ public class plasmaSnippetCache {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
- plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, forText);
+ plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, forText, global);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@@ -648,13 +649,13 @@ public class plasmaSnippetCache {
}
}
- public static ArrayList retrieveMediaSnippets(yacyURL url, Set queryhashes, int mediatype, boolean fetchOnline, int timeout) {
+ public static ArrayList retrieveMediaSnippets(yacyURL url, Set queryhashes, int mediatype, boolean fetchOnline, int timeout, boolean reindexing) {
if (queryhashes.size() == 0) {
serverLog.logFine("snippet fetch", "no query hashes given for url " + url);
return new ArrayList();
}
- plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout, false);
+ plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout, false, reindexing);
ArrayList a = new ArrayList();
if (document != null) {
if ((mediatype == plasmaSearchQuery.CONTENTDOM_ALL) || (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, plasmaSearchQuery.CONTENTDOM_AUDIO));
@@ -860,7 +861,7 @@ public class plasmaSnippetCache {
* [1] | the content-length as {@link Integer} |
*
*/
- public static Object[] getResource(yacyURL url, boolean fetchOnline, int socketTimeout, boolean forText) {
+ public static Object[] getResource(yacyURL url, boolean fetchOnline, int socketTimeout, boolean forText, boolean reindexing) {
// load the url as resource from the web
long contentLength = -1;
@@ -872,7 +873,7 @@ public class plasmaSnippetCache {
// if the content is not available in cache try to download it from web
// try to download the resource using a crawler
- plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true, forText);
+ plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true, forText, reindexing);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 96d7887b5..3fb42e891 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -212,8 +212,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"
@@ -1511,8 +1513,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch i = this.profilesActiveCrawls.profiles(true);
plasmaCrawlProfile.entry profile;
String name;
@@ -1521,8 +1525,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch