added snippet-url re-indexing

- snippet loading will generate an entry in responseHeader.db
- there is now an additional default crawl profile for snippet loading
- pages fetched during snippet loading will be indexed with an indexing depth of 0
- better organization of the default profiles

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2733 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 2cfd4633ac
commit c8f3a7d363

@ -50,7 +50,6 @@ import java.io.File;
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -110,14 +109,13 @@ public class ProxyIndexingMonitor_p {
sb.setCacheSize(Long.parseLong(newProxyCacheSize));
// implant these settings also into the crawling profile for the proxy
plasmaCrawlProfile.entry profile = sb.profiles.getEntry(sb.getConfig("defaultProxyProfile", ""));
if (profile == null) {
if (sb.defaultProxyProfile == null) {
prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db
} else {
try {
profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
profile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false");
profile.changeEntry("remoteIndexing",proxyCrawlOrder ? "true":"false");
sb.defaultProxyProfile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.defaultProxyProfile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.defaultProxyProfile.changeEntry("remoteIndexing",proxyCrawlOrder ? "true":"false");
prop.put("info", 2);//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);

@ -65,7 +65,7 @@ public final class plasmaCrawlLoaderMessage {
// loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
public plasmaCrawlLoaderMessage(
URL url,
String name,
String name, // the name of the url, from anchor tag <a>name</a>
String referer,
String initiator,
int depth,

@ -65,6 +65,7 @@ import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacyCore;
public class plasmaSnippetCache {
@ -209,10 +210,6 @@ public class plasmaSnippetCache {
if (resContent != null) {
// if the content was found
resContentLength = this.cacheManager.getResourceContentLength(url);
// getting resource metadata
resInfo = this.cacheManager.loadResourceInfo(url);
} else if (fetchOnline) {
// if not found try to download it
@ -616,12 +613,12 @@ public class plasmaSnippetCache {
) throws plasmaCrawlerException {
plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync(
url,
"",
null,
null,
0,
null,
url, // the url
"", // name of the url, from anchor tag <a>name</a>
null, // referer
yacyCore.seedDB.mySeed.hash, // initiator
0, // depth
sb.defaultSnippetProfile, // crawl profile
socketTimeout,
keepInMemory
);

@ -219,6 +219,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public plasmaCrawlProfile profiles;
public plasmaCrawlProfile.entry defaultProxyProfile;
public plasmaCrawlProfile.entry defaultRemoteProfile;
public plasmaCrawlProfile.entry defaultSnippetProfile;
public boolean rankingOn;
public plasmaRankingDistribution rankingOwnDistribution;
public plasmaRankingDistribution rankingOtherDistribution;
@ -251,8 +252,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/*
* Some constants
*/
private static final String STR_PROXYPROFILE = "defaultProxyProfile";
private static final String STR_REMOTEPROFILE = "defaultRemoteProfile";
private static final String STR_REMOTECRAWLTRIGGER = "REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER ";
private serverSemaphore shutdownSync = new serverSemaphore(0);
@ -744,23 +743,35 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
private void initProfiles() {
if ((this.profiles.size() == 0) ||
(getConfig(STR_PROXYPROFILE, "").length() == 0) ||
(this.profiles.getEntry(getConfig(STR_PROXYPROFILE, "")) == null)) {
this.defaultProxyProfile = null;
this.defaultRemoteProfile = null;
this.defaultSnippetProfile = null;
Iterator i = this.profiles.profiles(true);
plasmaCrawlProfile.entry profile;
String name;
while (i.hasNext()) {
profile = (plasmaCrawlProfile.entry) i.next();
name = profile.name();
if (name.equals("proxy")) this.defaultProxyProfile = profile;
if (name.equals("remote")) this.defaultRemoteProfile = profile;
if (name.equals("snippet")) this.defaultSnippetProfile = profile;
}
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), 60 * 24 * 30, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
setConfig(STR_PROXYPROFILE, this.defaultProxyProfile.handle());
} else {
this.defaultProxyProfile = this.profiles.getEntry(getConfig(STR_PROXYPROFILE, ""));
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*",
Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
60 * 24, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
}
if ((profiles.size() == 1) ||
(getConfig(STR_REMOTEPROFILE, "").length() == 0) ||
(profiles.getEntry(getConfig(STR_REMOTEPROFILE, "")) == null)) {
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, false, true, true, false, true, true, false);
setConfig(STR_REMOTEPROFILE, defaultRemoteProfile.handle());
} else {
defaultRemoteProfile = profiles.getEntry(getConfig(STR_REMOTEPROFILE, ""));
defaultRemoteProfile = this.profiles.newEntry("remote", "", ".*", ".*", 0, 0,
-1, -1, -1, true, false, true, true, false, true, true, false);
}
if (this.defaultSnippetProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultSnippetProfile = this.profiles.newEntry("snippet", "", ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, true, true, true, false, true, true, false);
}
}
@ -785,7 +796,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// getting next profile
entry = (plasmaCrawlProfile.entry) iter.next();
if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) {
if (!((entry.name().equals("proxy")) ||
(entry.name().equals("remote")) ||
(entry.name().equals("snippet")))) {
iter.remove();
hasDoneSomething = true;
}

@ -439,11 +439,6 @@ crawlingQ=false
storeHTCache=false
storeTXCache=true
# default crawl profile entries
# if these entries are empty, then a new entry will be generated
defaultProxyProfile=
defaultRemoteProfile=
# peers may initiate remote crawling tasks.
# every peer may allow or disallow being used as a crawling peer;
# you can also set a maximum crawl depth that can be requested or accepted

Loading…
Cancel
Save