From c8f3a7d3633562122e0733a03cd42abd749668f5 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 9 Oct 2006 23:07:10 +0000
Subject: [PATCH] added snippet-url re-indexing

- snippets will generate an entry in responseHeader.db
- there is now another default profile for snippet loading
- pages from snippet-loading will be indexed, indexing depth = 0
- better organization of default profiles

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2733 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
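Notes below the tearline (not part of the commit message; git am drops
everything between here and the diff):

The organizational core of this patch is plasmaSwitchboard.initProfiles():
the default crawl profiles are no longer located through handles stored in
yacy.init (the defaultProxyProfile= and defaultRemoteProfile= entries removed
below), but by scanning the profile database for the well-known names
"proxy", "remote" and "snippet", creating whichever default is missing.
What follows is a minimal, runnable sketch of that lookup-or-create pattern;
it uses simplified stand-in types (a List of names instead of
plasmaCrawlProfile entries), not actual YaCy classes:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class InitProfilesSketch {

    // stand-in for the plasmaCrawlProfile database; entries are profile names
    private static final List<String> profiles = new ArrayList<String>();

    private static void initProfiles() {
        String proxy = null, remote = null, snippet = null;
        // pass 1: resolve existing defaults by their well-known names
        Iterator<String> i = profiles.iterator();
        while (i.hasNext()) {
            String name = i.next();
            if (name.equals("proxy"))   proxy   = name;
            if (name.equals("remote"))  remote  = name;
            if (name.equals("snippet")) snippet = name;
        }
        // pass 2: create whatever default is missing
        // (this is where the real code calls profiles.newEntry(...))
        if (proxy   == null) profiles.add(proxy   = "proxy");
        if (remote  == null) profiles.add(remote  = "remote");
        if (snippet == null) profiles.add(snippet = "snippet");
        System.out.println("default profiles: " + proxy + ", " + remote + ", " + snippet);
    }

    public static void main(String[] args) {
        profiles.add("proxy"); // an existing profile is reused; remote and snippet get created
        initProfiles();
    }
}

Keeping the three names as the only contract means a deleted
crawlProfiles0.db heals itself on the next startup, and cleanProfiles() can
use the same names as its whitelist (see the @@ -785 hunk below).
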
"true":"false"); prop.put("info", 2);//new proxyPrefetchdepth prop.put("info_message", newProxyPrefetchDepth); diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 441bdef2c..3d2f87f61 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -455,10 +455,10 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt requestDate, // init date 0, // crawling depth url, // url - "", // name of the url is unknown - //requestHeader, // request headers + "", // name of the url is unknown + //requestHeader, // request headers "200 OK", // request status - //cachedResponseHeader, // response headers + //cachedResponseHeader, // response headers cachedResInfo, null, // initiator switchboard.defaultProxyProfile // profile diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index c05f08e5b..8d9918354 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -205,7 +205,7 @@ public class plasmaCrawlEURL extends indexURL { private String hash; // the url's hash private String referrer; // the url's referrer hash private String initiator; // the crawling initiator - private String executor; // the crawling initiator + private String executor; // the crawling initiator private URL url; // the url as string private String name; // the name of the url, from anchor tag name private Date initdate; // the time when the url was first time appeared diff --git a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java index 60929d606..cd6eb1cd8 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java +++ b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java @@ -65,7 +65,7 @@ public final class plasmaCrawlLoaderMessage { // loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) { public plasmaCrawlLoaderMessage( URL url, - String name, + String name, // the name of the url, from anchor tag name String referer, String initiator, int depth, diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 138baf738..6ee1f2de8 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -65,6 +65,7 @@ import de.anomic.plasma.crawler.plasmaCrawlerException; import de.anomic.plasma.parser.ParserException; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySearch; +import de.anomic.yacy.yacyCore; public class plasmaSnippetCache { @@ -209,10 +210,6 @@ public class plasmaSnippetCache { if (resContent != null) { // if the content was found resContentLength = this.cacheManager.getResourceContentLength(url); - - // getting resource metadata - resInfo = this.cacheManager.loadResourceInfo(url); - } else if (fetchOnline) { // if not found try to download it @@ -616,12 +613,12 @@ public class plasmaSnippetCache { ) throws plasmaCrawlerException { plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync( - url, - "", - null, - null, - 0, - null, + url, // the url + "", // name of the url, from anchor tag name + null, // referer + yacyCore.seedDB.mySeed.hash, // initiator + 0, // depth + sb.defaultSnippetProfile, // crawl profile socketTimeout, keepInMemory ); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java 
index e64c00f61..f5990ebaf 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -219,6 +219,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     public plasmaCrawlProfile profiles;
     public plasmaCrawlProfile.entry defaultProxyProfile;
     public plasmaCrawlProfile.entry defaultRemoteProfile;
+    public plasmaCrawlProfile.entry defaultSnippetProfile;
     public boolean rankingOn;
     public plasmaRankingDistribution rankingOwnDistribution;
     public plasmaRankingDistribution rankingOtherDistribution;
@@ -251,8 +252,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     /*
      * Some constants
      */
-    private static final String STR_PROXYPROFILE = "defaultProxyProfile";
-    private static final String STR_REMOTEPROFILE = "defaultRemoteProfile";
     private static final String STR_REMOTECRAWLTRIGGER = "REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER ";
 
     private serverSemaphore shutdownSync = new serverSemaphore(0);
@@ -744,23 +743,35 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     }
 
     private void initProfiles() {
-        if ((this.profiles.size() == 0) ||
-            (getConfig(STR_PROXYPROFILE, "").length() == 0) ||
-            (this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) {
+        this.defaultProxyProfile = null;
+        this.defaultRemoteProfile = null;
+        this.defaultSnippetProfile = null;
+        Iterator i = this.profiles.profiles(true);
+        plasmaCrawlProfile.entry profile;
+        String name;
+        while (i.hasNext()) {
+            profile = (plasmaCrawlProfile.entry) i.next();
+            name = profile.name();
+            if (name.equals("proxy")) this.defaultProxyProfile = profile;
+            if (name.equals("remote")) this.defaultRemoteProfile = profile;
+            if (name.equals("snippet")) this.defaultSnippetProfile = profile;
+        }
+        if (this.defaultProxyProfile == null) {
             // generate new default entry for proxy crawling
-            this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
-            setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle());
-        } else {
-            this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, ""));
+            this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*",
+                    Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
+                    Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
+                    60 * 24, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
         }
-        if ((profiles.size() == 1) ||
-            (getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
-            (profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
+        if (this.defaultRemoteProfile == null) {
             // generate new default entry for remote crawling
-            defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, false, true, true, false, true, true, false);
-            setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle());
-        } else {
-            defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, ""));
+            defaultRemoteProfile = this.profiles.newEntry("remote", "", ".*", ".*", 0, 0,
+                    -1, -1, -1, true, false, true, true, false, true, true, false);
+        }
+        if (this.defaultSnippetProfile == null) {
+            // generate new default entry for snippet fetch and optional crawling
this.profiles.newEntry("snippet", "", ".*", ".*", 0, 0, + 60 * 24 * 30, -1, -1, true, true, true, true, false, true, true, false); } } @@ -785,7 +796,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // getting next profile entry = (plasmaCrawlProfile.entry) iter.next(); - if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) { + if (!((entry.name().equals("proxy")) || + (entry.name().equals("remote")) || + (entry.name().equals("snippet")))) { iter.remove(); hasDoneSomething = true; } diff --git a/yacy.init b/yacy.init index 6033bb0ba..5171fa40d 100644 --- a/yacy.init +++ b/yacy.init @@ -439,11 +439,6 @@ crawlingQ=false storeHTCache=false storeTXCache=true -# default crawl profile entries -# if these entries are empty, then a new entry will be generated -defaultProxyProfile= -defaultRemoteProfile= - # peers may initiate remote crawling tasks. # every peer may allow or disallow to be used as crawling-peer; # you can also set a maximum crawl depth that can be requested or accepted