added snippet-url re-indexing

- snippets will generate an entry in responseHeader.db
- there is now a third default crawl profile, "snippet", used for snippet loading (alongside "proxy" and "remote")
- pages fetched during snippet loading will be indexed; the snippet profile uses a crawl depth of 0
- better organization of default profiles

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2733 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 2cfd4633ac
commit c8f3a7d363

@ -50,7 +50,6 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -110,14 +109,13 @@ public class ProxyIndexingMonitor_p {
sb.setCacheSize(Long.parseLong(newProxyCacheSize)); sb.setCacheSize(Long.parseLong(newProxyCacheSize));
// implant these settings also into the crawling profile for the proxy // implant these settings also into the crawling profile for the proxy
plasmaCrawlProfile.entry profile = sb.profiles.getEntry(sb.getConfig("defaultProxyProfile", "")); if (sb.defaultProxyProfile == null) {
if (profile == null) {
prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db
} else { } else {
try { try {
profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth)); sb.defaultProxyProfile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
profile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false"); sb.defaultProxyProfile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false");
profile.changeEntry("remoteIndexing",proxyCrawlOrder ? "true":"false"); sb.defaultProxyProfile.changeEntry("remoteIndexing",proxyCrawlOrder ? "true":"false");
prop.put("info", 2);//new proxyPrefetchdepth prop.put("info", 2);//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth); prop.put("info_message", newProxyPrefetchDepth);

@ -65,7 +65,7 @@ public final class plasmaCrawlLoaderMessage {
// loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) { // loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
public plasmaCrawlLoaderMessage( public plasmaCrawlLoaderMessage(
URL url, URL url,
String name, String name, // the name of the url, from anchor tag <a>name</a>
String referer, String referer,
String initiator, String initiator,
int depth, int depth,

@ -65,6 +65,7 @@ import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.parser.ParserException;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySearch; import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacyCore;
public class plasmaSnippetCache { public class plasmaSnippetCache {
@ -209,10 +210,6 @@ public class plasmaSnippetCache {
if (resContent != null) { if (resContent != null) {
// if the content was found // if the content was found
resContentLength = this.cacheManager.getResourceContentLength(url); resContentLength = this.cacheManager.getResourceContentLength(url);
// getting resource metadata
resInfo = this.cacheManager.loadResourceInfo(url);
} else if (fetchOnline) { } else if (fetchOnline) {
// if not found try to download it // if not found try to download it
@ -616,12 +613,12 @@ public class plasmaSnippetCache {
) throws plasmaCrawlerException { ) throws plasmaCrawlerException {
plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync( plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync(
url, url, // the url
"", "", // name of the url, from anchor tag <a>name</a>
null, null, // referer
null, yacyCore.seedDB.mySeed.hash, // initiator
0, 0, // depth
null, sb.defaultSnippetProfile, // crawl profile
socketTimeout, socketTimeout,
keepInMemory keepInMemory
); );

@ -219,6 +219,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public plasmaCrawlProfile profiles; public plasmaCrawlProfile profiles;
public plasmaCrawlProfile.entry defaultProxyProfile; public plasmaCrawlProfile.entry defaultProxyProfile;
public plasmaCrawlProfile.entry defaultRemoteProfile; public plasmaCrawlProfile.entry defaultRemoteProfile;
public plasmaCrawlProfile.entry defaultSnippetProfile;
public boolean rankingOn; public boolean rankingOn;
public plasmaRankingDistribution rankingOwnDistribution; public plasmaRankingDistribution rankingOwnDistribution;
public plasmaRankingDistribution rankingOtherDistribution; public plasmaRankingDistribution rankingOtherDistribution;
@ -251,8 +252,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* /*
* Some constants * Some constants
*/ */
private static final String STR_PROXYPROFILE = "defaultProxyProfile";
private static final String STR_REMOTEPROFILE = "defaultRemoteProfile";
private static final String STR_REMOTECRAWLTRIGGER = "REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER "; private static final String STR_REMOTECRAWLTRIGGER = "REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER ";
private serverSemaphore shutdownSync = new serverSemaphore(0); private serverSemaphore shutdownSync = new serverSemaphore(0);
@ -744,23 +743,35 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
private void initProfiles() { private void initProfiles() {
if ((this.profiles.size() == 0) || this.defaultProxyProfile = null;
(getConfig(STR_PROXYPROFILE, "").length() == 0) || this.defaultRemoteProfile = null;
(this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) { this.defaultSnippetProfile = null;
Iterator i = this.profiles.profiles(true);
plasmaCrawlProfile.entry profile;
String name;
while (i.hasNext()) {
profile = (plasmaCrawlProfile.entry) i.next();
name = profile.name();
if (name.equals("proxy")) this.defaultProxyProfile = profile;
if (name.equals("remote")) this.defaultRemoteProfile = profile;
if (name.equals("snippet")) this.defaultSnippetProfile = profile;
}
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling // generate new default entry for proxy crawling
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true); this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*",
setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle()); Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
} else { Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")); 60 * 24, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
} }
if ((profiles.size() == 1) || if (this.defaultRemoteProfile == null) {
(getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
(profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
// generate new default entry for remote crawling // generate new default entry for remote crawling
defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, false, true, true, false, true, true, false); defaultRemoteProfile = this.profiles.newEntry("remote", "", ".*", ".*", 0, 0,
setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle()); -1, -1, -1, true, false, true, true, false, true, true, false);
} else { }
defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")); if (this.defaultSnippetProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultSnippetProfile = this.profiles.newEntry("snippet", "", ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, true, true, true, false, true, true, false);
} }
} }
@ -785,7 +796,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// getting next profile // getting next profile
entry = (plasmaCrawlProfile.entry) iter.next(); entry = (plasmaCrawlProfile.entry) iter.next();
if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) { if (!((entry.name().equals("proxy")) ||
(entry.name().equals("remote")) ||
(entry.name().equals("snippet")))) {
iter.remove(); iter.remove();
hasDoneSomething = true; hasDoneSomething = true;
} }

@ -439,11 +439,6 @@ crawlingQ=false
storeHTCache=false storeHTCache=false
storeTXCache=true storeTXCache=true
# default crawl profile entries
# if these entries are empty, then a new entry will be generated
defaultProxyProfile=
defaultRemoteProfile=
# peers may initiate remote crawling tasks. # peers may initiate remote crawling tasks.
# every peer may allow or disallow to be used as crawling-peer; # every peer may allow or disallow to be used as crawling-peer;
# you can also set a maximum crawl depth that can be requested or accepted # you can also set a maximum crawl depth that can be requested or accepted

Loading…
Cancel
Save