enhanced parallelization of local/global/remote crawling

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@197 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent a05d738ea4
commit 3d8a2ff937

@ -169,12 +169,17 @@ public class IndexCreate_p {
if (post.containsKey("clearcrawlqueue")) { if (post.containsKey("clearcrawlqueue")) {
String urlHash; String urlHash;
int c = 0; int c = 0;
while (switchboard.noticeURL.localStackSize() > 0) { while (switchboard.noticeURL.coreStackSize() > 0) {
urlHash = switchboard.noticeURL.localPop().hash(); urlHash = switchboard.noticeURL.corePop().hash();
if (urlHash != null) { if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
switchboard.noticeURL.remove(urlHash); }
c++; while (switchboard.noticeURL.limitStackSize() > 0) {
} urlHash = switchboard.noticeURL.limitPop().hash();
if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
}
while (switchboard.noticeURL.remoteStackSize() > 0) {
urlHash = switchboard.noticeURL.remotePop().hash();
if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
} }
prop.put("info", 3);//crawling queue cleared prop.put("info", 3);//crawling queue cleared
prop.put("info_numEntries", c); prop.put("info_numEntries", c);
@ -375,12 +380,12 @@ public class IndexCreate_p {
prop.put("loader-set_list", i ); prop.put("loader-set_list", i );
} }
int localStackSize = switchboard.noticeURL.localStackSize(); int localStackSize = switchboard.noticeURL.coreStackSize();
if (localStackSize == 0) { if (localStackSize == 0) {
prop.put("crawler-queue", 0); prop.put("crawler-queue", 0);
} else { } else {
prop.put("crawler-queue", 1); prop.put("crawler-queue", 1);
plasmaCrawlNURL.entry[] crawlerList = switchboard.noticeURL.localTop(20); plasmaCrawlNURL.entry[] crawlerList = switchboard.noticeURL.coreTop(20);
prop.put("crawler-queue_num", localStackSize);//num Entries prop.put("crawler-queue_num", localStackSize);//num Entries
prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent
plasmaCrawlNURL.entry urle; plasmaCrawlNURL.entry urle;

@ -61,8 +61,13 @@ import de.anomic.tools.bitfield;
public class plasmaCrawlNURL extends plasmaURL { public class plasmaCrawlNURL extends plasmaURL {
// Stack-type selectors, passed as the last argument of newEntry(...) to choose
// which kelondroStack receives the hash of a newly noticed URL.
public static final int STACK_TYPE_NULL = 0; // do not stack
public static final int STACK_TYPE_CORE = 1; // put on the core stack (the former "local" stack)
public static final int STACK_TYPE_LIMIT = 2; // put on the limit stack (the former "global" stack)
public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled
public static final int STACK_TYPE_REMOTE = 4; // put on the remote stack, i.e. crawl requests triggered by remote peers
private kelondroStack localStack; // links found by crawling to depth-1 private kelondroStack coreStack; // links found by crawling to depth-1
private kelondroStack limitStack; // links found by crawling at target depth private kelondroStack limitStack; // links found by crawling at target depth
private kelondroStack overhangStack; // links found by crawling at depth+1 private kelondroStack overhangStack; // links found by crawling at depth+1
private kelondroStack remoteStack; // links from remote crawl orders private kelondroStack remoteStack; // links from remote crawl orders
@ -101,9 +106,21 @@ public class plasmaCrawlNURL extends plasmaURL {
File localCrawlStack = new File(cacheStacksPath, "urlNoticeLocal0.stack"); File localCrawlStack = new File(cacheStacksPath, "urlNoticeLocal0.stack");
if (localCrawlStack.exists()) { if (localCrawlStack.exists()) {
localStack = new kelondroStack(localCrawlStack, 0); coreStack = new kelondroStack(localCrawlStack, 0);
} else { } else {
localStack = new kelondroStack(localCrawlStack, 0, new int[] {plasmaURL.urlHashLength}); coreStack = new kelondroStack(localCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
}
File limitCrawlStack = new File(cacheStacksPath, "urlNoticeLimit0.stack");
if (limitCrawlStack.exists()) {
limitStack = new kelondroStack(limitCrawlStack, 0);
} else {
limitStack = new kelondroStack(limitCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
}
File overhangCrawlStack = new File(cacheStacksPath, "urlNoticeOverhang0.stack");
if (overhangCrawlStack.exists()) {
overhangStack = new kelondroStack(overhangCrawlStack, 0);
} else {
overhangStack = new kelondroStack(overhangCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
} }
File globalCrawlStack = new File(cacheStacksPath, "urlNoticeRemote0.stack"); File globalCrawlStack = new File(cacheStacksPath, "urlNoticeRemote0.stack");
if (globalCrawlStack.exists()) { if (globalCrawlStack.exists()) {
@ -114,7 +131,7 @@ public class plasmaCrawlNURL extends plasmaURL {
// init stack Index // init stack Index
stackIndex = new HashSet(); stackIndex = new HashSet();
Iterator i = localStack.iterator(); Iterator i = coreStack.iterator();
while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
i = remoteStack.iterator(); i = remoteStack.iterator();
while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
@ -134,11 +151,22 @@ public class plasmaCrawlNURL extends plasmaURL {
} }
public int stackSize() { public int stackSize() {
return localStack.size() + remoteStack.size(); // this does not count the overhang stack size
return coreStack.size() + limitStack.size() + remoteStack.size();
} }
public int localStackSize() {
return localStack.size(); public int coreStackSize() {
return coreStack.size();
}
public int limitStackSize() {
return limitStack.size();
} }
/**
 * Reports the current size of the overhang stack.
 *
 * @return the value of overhangStack.size()
 */
public int overhangStackSize() {
    final int entries = overhangStack.size();
    return entries;
}
public int remoteStackSize() { public int remoteStackSize() {
return remoteStack.size(); return remoteStack.size();
} }
@ -159,21 +187,24 @@ public class plasmaCrawlNURL extends plasmaURL {
// 3 = on overhang stack // 3 = on overhang stack
// 4 = on remote stack // 4 = on remote stack
try { try {
if (stackMode == 1) { if (stackMode == 1) coreStack.push(new byte[][] {e.hash.getBytes()});
localStack.push(new byte[][] {e.hash.getBytes()}); if (stackMode == 2) limitStack.push(new byte[][] {e.hash.getBytes()});
stackIndex.add(new String(e.hash.getBytes())); if (stackMode == 3) overhangStack.push(new byte[][] {e.hash.getBytes()});
} if (stackMode == 4) remoteStack.push(new byte[][] {e.hash.getBytes()});
if (stackMode == 4) { stackIndex.add(new String(e.hash.getBytes()));
remoteStack.push(new byte[][] {e.hash.getBytes()});
stackIndex.add(new String(e.hash.getBytes()));
}
} catch (IOException er) { } catch (IOException er) {
} }
return e; return e;
} }
public entry localPop() { return pop(localStack); } public entry corePop() { return pop(coreStack); }
public entry[] localTop(int count) { return top(localStack, count); } public entry[] coreTop(int count) { return top(coreStack, count); }
/**
 * Delegates to pop(limitStack).
 * NOTE(review): presumably removes and returns the top entry of the limit stack;
 * pop() is not visible here, confirm against its implementation.
 *
 * @return whatever pop(limitStack) yields
 */
public entry limitPop() {
    final entry next = pop(limitStack);
    return next;
}
/**
 * Delegates to top(limitStack, count).
 * NOTE(review): presumably lists entries from the top of the limit stack without
 * removing them; top() is not visible here, confirm against its implementation.
 *
 * @param count passed through unchanged to top(...)
 * @return whatever top(limitStack, count) yields
 */
public entry[] limitTop(int count) {
    final entry[] listed = top(limitStack, count);
    return listed;
}
/**
 * Delegates to pop(overhangStack).
 * NOTE(review): presumably removes and returns the top entry of the overhang stack;
 * pop() is not visible here, confirm against its implementation.
 *
 * @return whatever pop(overhangStack) yields
 */
public entry overhangPop() {
    final entry next = pop(overhangStack);
    return next;
}
/**
 * Delegates to top(overhangStack, count).
 * NOTE(review): presumably lists entries from the top of the overhang stack without
 * removing them; top() is not visible here, confirm against its implementation.
 *
 * @param count passed through unchanged to top(...)
 * @return whatever top(overhangStack, count) yields
 */
public entry[] overhangTop(int count) {
    final entry[] listed = top(overhangStack, count);
    return listed;
}
public entry remotePop() { return pop(remoteStack); } public entry remotePop() { return pop(remoteStack); }
public entry[] remoteTop(int count) { return top(remoteStack, count); } public entry[] remoteTop(int count) { return top(remoteStack, count); }
@ -344,6 +375,7 @@ public class plasmaCrawlNURL extends plasmaURL {
} }
} }
/*
public class kenum implements Enumeration { public class kenum implements Enumeration {
// enumerates entry elements // enumerates entry elements
kelondroTree.rowIterator i; kelondroTree.rowIterator i;
@ -362,5 +394,5 @@ public class plasmaCrawlNURL extends plasmaURL {
// enumerates entry elements // enumerates entry elements
return new kenum(up, rotating); return new kenum(up, rotating);
} }
*/
} }

@ -323,10 +323,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
new serverInstantThread(this, "deQueue", "queueSize"), 10000); new serverInstantThread(this, "deQueue", "queueSize"), 10000);
deployThread("70_cachemanager", "Proxy Cache Enqueue", "job takes new proxy files from RAM stack, stores them, and hands over to the Indexing Stack", deployThread("70_cachemanager", "Proxy Cache Enqueue", "job takes new proxy files from RAM stack, stores them, and hands over to the Indexing Stack",
new serverInstantThread(cacheManager, "job", "size"), 10000); new serverInstantThread(cacheManager, "job", "size"), 10000);
deployThread("60_globalcrawl", "Global Crawl", "thread that performes a single crawl/indexing step of a web page for global crawling", deployThread("62_remotetriggeredcrawl", "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer",
new serverInstantThread(this, "globalCrawlJob", "globalCrawlJobSize"), 30000); new serverInstantThread(this, "remoteTriggeredCrawlJob", "remoteTriggeredCrawlJobSize"), 30000);
deployThread("61_globalcrawltrigger", "Global Crawl Trigger", "thread that triggeres remote peers for crawling",
new serverInstantThread(this, "limitCrawlTriggerJob", "limitCrawlTriggerJobSize"), 30000);
deployThread("50_localcrawl", "Local Crawl", "thread that performes a single crawl step from the local crawl queue", deployThread("50_localcrawl", "Local Crawl", "thread that performes a single crawl step from the local crawl queue",
new serverInstantThread(this, "localCrawlJob", "localCrawlJobSize"), 10000); new serverInstantThread(this, "coreCrawlJob", "coreCrawlJobSize"), 10000);
deployThread("40_peerseedcycle", "Seed-List Upload", "task that a principal peer performes to generate and upload a seed-list to a ftp account", deployThread("40_peerseedcycle", "Seed-List Upload", "task that a principal peer performes to generate and upload a seed-list to a ftp account",
new serverInstantThread(yc, "publishSeedList", null), 180000); new serverInstantThread(yc, "publishSeedList", null), 180000);
deployThread("30_peerping", "YaCy Core", "this is the p2p-control and peer-ping task", deployThread("30_peerping", "YaCy Core", "this is the p2p-control and peer-ping task",
@ -374,7 +376,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} catch (IOException e) {} } catch (IOException e) {}
} }
private void cleanProfiles() { private void cleanProfiles() {
if (totalSize() > 0) return; if (queueSize() > 0) return;
Iterator i = profiles.profiles(true); Iterator i = profiles.profiles(true);
plasmaCrawlProfile.entry entry; plasmaCrawlProfile.entry entry;
try { try {
@ -428,12 +430,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSystem("SWITCHBOARD SHUTDOWN TERMINATED"); log.logSystem("SWITCHBOARD SHUTDOWN TERMINATED");
} }
/*
public int totalSize() { public int totalSize() {
return processStack.size() + cacheLoader.size() + noticeURL.stackSize(); return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
} }
*/
public int queueSize() { public int queueSize() {
return processStack.size(); return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
} }
public int lUrlSize() { public int lUrlSize() {
@ -463,7 +467,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do one processing step // do one processing step
log.logDebug("DEQUEUE: cacheManager=" + ((cacheManager.idle()) ? "idle" : "busy") + log.logDebug("DEQUEUE: cacheManager=" + ((cacheManager.idle()) ? "idle" : "busy") +
", processStack=" + processStack.size() + ", processStack=" + processStack.size() +
", localStackSize=" + noticeURL.localStackSize() + ", coreStackSize=" + noticeURL.coreStackSize() +
", limitStackSize=" + noticeURL.limitStackSize() +
", overhangStackSize=" + noticeURL.overhangStackSize() +
", remoteStackSize=" + noticeURL.remoteStackSize()); ", remoteStackSize=" + noticeURL.remoteStackSize());
processResourceStack((plasmaHTCache.Entry) processStack.removeFirst()); processResourceStack((plasmaHTCache.Entry) processStack.removeFirst());
return true; return true;
@ -529,22 +535,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} }
public int localCrawlJobSize() { public int coreCrawlJobSize() {
return noticeURL.localStackSize(); return noticeURL.coreStackSize();
} }
public boolean localCrawlJob() { public boolean coreCrawlJob() {
if (noticeURL.localStackSize() == 0) { if (noticeURL.coreStackSize() == 0) {
//log.logDebug("LocalCrawl: queue is empty"); //log.logDebug("CoreCrawl: queue is empty");
return false; return false;
} }
if (processStack.size() >= crawlSlots) { if (processStack.size() >= crawlSlots) {
log.logDebug("LocalCrawl: too many processes in queue, dismissed (" + log.logDebug("CoreCrawl: too many processes in queue, dismissed (" +
"processStack=" + processStack.size() + ")"); "processStack=" + processStack.size() + ")");
return false; return false;
} }
if (cacheLoader.size() >= crawlSlots) { if (cacheLoader.size() >= crawlSlots) {
log.logDebug("LocalCrawl: too many loader in queue, dismissed (" + log.logDebug("CoreCrawl: too many loader in queue, dismissed (" +
"cacheLoader=" + cacheLoader.size() + ")"); "cacheLoader=" + cacheLoader.size() + ")");
return false; return false;
} }
@ -562,17 +568,91 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} }
// do a local crawl (may start a global crawl) // do a local crawl
plasmaCrawlNURL.entry nex = noticeURL.localPop(); plasmaCrawlNURL.entry urlEntry = noticeURL.corePop();
processCrawling(nex, nex.initiator()); if (urlEntry.url() == null) return false;
return true; String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
return processLocalCrawling(urlEntry, profile);
}
/**
 * Size probe paired with limitCrawlTriggerJob(); its name is handed to the
 * serverInstantThread that is deployed as "61_globalcrawltrigger".
 *
 * @return the current size of the limit stack as reported by noticeURL
 */
public int limitCrawlTriggerJobSize() {
    final int pending = noticeURL.limitStackSize();
    return pending;
}
public boolean limitCrawlTriggerJob() {
if (noticeURL.limitStackSize() == 0) {
//log.logDebug("LimitCrawl: queue is empty");
return false;
}
// if the server is busy, we do crawling more slowly
if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
// if crawling was paused we have to wait until we wer notified to continue
synchronized(this.crawlingPausedSync) {
if (this.crawlingIsPaused) {
try {
this.crawlingPausedSync.wait();
}
catch (InterruptedException e){ return false;}
}
}
// start a global crawl, if possible
plasmaCrawlNURL.entry urlEntry = noticeURL.limitPop();
if (urlEntry.url() == null) return false;
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("REMOTECRAWLTRIGGER[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
boolean tryRemote =
(profile.remoteIndexing()) /* granted */ &&
(urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) /* not proxy */ &&
((yacyCore.seedDB.mySeed.isSenior()) ||
(yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */;
if (tryRemote) {
boolean success = processRemoteCrawlTrigger(urlEntry);
if (success) return true;
}
// alternatively do a local crawl
if (processStack.size() >= crawlSlots) {
log.logDebug("LimitCrawl: too many processes in queue, dismissed (" +
"processStack=" + processStack.size() + ")");
return false;
}
if (cacheLoader.size() >= crawlSlots) {
log.logDebug("LimitCrawl: too many loader in queue, dismissed (" +
"cacheLoader=" + cacheLoader.size() + ")");
return false;
}
processLocalCrawling(urlEntry, profile);
return false;
} }
public int globalCrawlJobSize() { public int remoteTriggeredCrawlJobSize() {
return noticeURL.remoteStackSize(); return noticeURL.remoteStackSize();
} }
public boolean globalCrawlJob() { public boolean remoteTriggeredCrawlJob() {
// work off crawl requests that had been placed by other peers to our crawl stack // work off crawl requests that had been placed by other peers to our crawl stack
// do nothing if either there are private processes to be done // do nothing if either there are private processes to be done
@ -586,9 +666,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
"processStack=" + processStack.size() + ")"); "processStack=" + processStack.size() + ")");
return false; return false;
} }
if (noticeURL.localStackSize() > 0) { if (noticeURL.coreStackSize() > 0) {
log.logDebug("GlobalCrawl: any local crawl is in queue, dismissed (" + log.logDebug("GlobalCrawl: any local crawl is in queue, dismissed (" +
"localStackSize=" + noticeURL.localStackSize() + ")"); "coreStackSize=" + noticeURL.coreStackSize() + ")");
return false; return false;
} }
@ -606,9 +686,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view) // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
plasmaCrawlNURL.entry nex = noticeURL.remotePop(); plasmaCrawlNURL.entry urlEntry = noticeURL.remotePop();
processCrawling(nex, nex.initiator()); if (urlEntry.url() == null) return false;
return true; String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("REMOTETRIGGEREDCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
return processLocalCrawling(urlEntry, profile);
} }
private void processResourceStack(plasmaHTCache.Entry entry) { private void processResourceStack(plasmaHTCache.Entry entry) {
@ -687,7 +778,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} }
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() + log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +
", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize()); ", NEW CRAWL STACK SIZE IS " + noticeURL.coreStackSize());
} }
// create index // create index
@ -839,6 +930,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// store information // store information
boolean local = ((initiatorHash.equals(plasmaURL.dummyHash)) || (initiatorHash.equals(yacyCore.seedDB.mySeed.hash))); boolean local = ((initiatorHash.equals(plasmaURL.dummyHash)) || (initiatorHash.equals(yacyCore.seedDB.mySeed.hash)));
boolean global =
(profile.remoteIndexing()) /* granted */ &&
(currentdepth == profile.generalDepth()) /* leaf node */ &&
(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
((yacyCore.seedDB.mySeed.isSenior()) ||
(yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */;
noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */ nexturl, /* url clear text string */
loadDate, /* load date */ loadDate, /* load date */
@ -848,7 +946,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
currentdepth, /*depth so far*/ currentdepth, /*depth so far*/
0, /*anchors, default value */ 0, /*anchors, default value */
0, /*forkfactor, default value */ 0, /*forkfactor, default value */
((local) ? 1 : 4) /*local/remote stack*/ ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
); );
return null; return null;
@ -870,13 +969,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (u == null) return plasmaURL.dummyHash; else return u.toString(); if (u == null) return plasmaURL.dummyHash; else return u.toString();
} }
private void processCrawling(plasmaCrawlNURL.entry urlEntry, String initiator) {
private void processCrawlingX(plasmaCrawlNURL.entry urlEntry, String initiator) {
if (urlEntry.url() == null) return; if (urlEntry.url() == null) return;
String profileHandle = urlEntry.profileHandle(); String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) { if (profile == null) {
log.logError("CRAWL[" + noticeURL.localStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); log.logError("CRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return; return;
} }
log.logDebug("plasmaSwitchboard.processCrawling: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + log.logDebug("plasmaSwitchboard.processCrawling: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
@ -891,39 +991,41 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */; (yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */;
if (tryRemote) { if (tryRemote) {
boolean success = processGlobalCrawling(urlEntry); boolean success = processRemoteCrawlTrigger(urlEntry);
if (!(success)) processLocalCrawling(urlEntry, profile, initiator); if (!(success)) processLocalCrawling(urlEntry, profile);
} else { } else {
processLocalCrawling(urlEntry, profile, initiator); processLocalCrawling(urlEntry, profile);
} }
} }
private void processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile, String initiator) {
private boolean processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile) {
// work off one Crawl stack entry // work off one Crawl stack entry
if ((urlEntry == null) && (urlEntry.url() == null)) { if ((urlEntry == null) && (urlEntry.url() == null)) {
log.logInfo("LOCALCRAWL[" + noticeURL.localStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null"); log.logInfo("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null");
return; return false;
} }
cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), initiator, urlEntry.depth(), profile); cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo("LOCALCRAWL[" + noticeURL.localStackSize() + ", " + noticeURL.remoteStackSize() + "]: enqueued for load " + urlEntry.url()); log.logInfo("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: enqueued for load " + urlEntry.url());
return true;
} }
private boolean processGlobalCrawling(plasmaCrawlNURL.entry urlEntry) { private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.entry urlEntry) {
if (urlEntry == null) { if (urlEntry == null) {
log.logInfo("GLOBALCRAWL[" + noticeURL.localStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null"); log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null");
return false; return false;
} }
// are we qualified? // are we qualified?
if ((yacyCore.seedDB.mySeed == null) || if ((yacyCore.seedDB.mySeed == null) ||
(yacyCore.seedDB.mySeed.isJunior())) { (yacyCore.seedDB.mySeed.isJunior())) {
log.logDebug("plasmaSwitchboard.processGlobalCrawling: no permission"); log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
return false; return false;
} }
// check url // check url
if (urlEntry.url() == null) { if (urlEntry.url() == null) {
log.logDebug("ERROR: plasmaSwitchboard.processGlobalCrawling - url is null. name=" + urlEntry.name()); log.logDebug("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name());
return false; return false;
} }
String nexturlString = urlEntry.url().toString(); String nexturlString = urlEntry.url().toString();
@ -932,7 +1034,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// check remote crawl // check remote crawl
yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlhash); yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlhash);
if (remoteSeed == null) { if (remoteSeed == null) {
log.logDebug("plasmaSwitchboard.processGlobalCrawling: no remote crawl seed available"); log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available");
return false; return false;
} }
@ -960,13 +1062,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyCore.peerActions.peerDeparture(remoteSeed); yacyCore.peerActions.peerDeparture(remoteSeed);
return false; return false;
} else try { } else try {
log.logDebug("plasmaSwitchboard.processGlobalCrawling: remoteSeed=" + remoteSeed.getName() + ", url=" + nexturlString + ", response=" + page.toString()); // DEBUG log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + nexturlString + ", response=" + page.toString()); // DEBUG
int newdelay = Integer.parseInt((String) page.get("delay")); int newdelay = Integer.parseInt((String) page.get("delay"));
yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay); yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
String response = (String) page.get("response"); String response = (String) page.get("response");
if (response.equals("stacked")) { if (response.equals("stacked")) {
log.logInfo("GLOBALCRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " PLACED URL=" + nexturlString + "; NEW DELAY=" + newdelay); log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " PLACED URL=" + nexturlString + "; NEW DELAY=" + newdelay);
return true; return true;
} else if (response.equals("double")) { } else if (response.equals("double")) {
String lurl = (String) page.get("lurl"); String lurl = (String) page.get("lurl");
@ -974,19 +1076,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.entry entry = loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); plasmaCrawlLURL.entry entry = loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
noticeURL.remove(entry.hash()); noticeURL.remove(entry.hash());
log.logInfo("GLOBALCRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + "). URL IS CONSIDERED AS 'LOADED!'"); log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + "). URL IS CONSIDERED AS 'LOADED!'");
return true; return true;
} else { } else {
log.logInfo("GLOBALCRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + ")"); log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + ")");
return false; return false;
} }
} else { } else {
log.logInfo("GLOBALCRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + nexturlString); log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + nexturlString);
return false; return false;
} }
} catch (Exception e) { } catch (Exception e) {
// wrong values // wrong values
log.logError("GLOBALCRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString()); log.logError("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString());
e.printStackTrace(); e.printStackTrace();
return false; return false;
} }
@ -1337,7 +1439,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int transferred; int transferred;
long starttime = System.currentTimeMillis(); long starttime = System.currentTimeMillis();
try { try {
if ((totalSize() == 0) && if ((queueSize() == 0) &&
(getConfig("allowDistributeIndex", "false").equals("true")) && (getConfig("allowDistributeIndex", "false").equals("true")) &&
((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) { ((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) {
indexCount = transferred; indexCount = transferred;

@ -57,7 +57,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
private static final String newSingletonFileName = "indexAssortment001.db"; private static final String newSingletonFileName = "indexAssortment001.db";
private static final String indexAssortmentClusterPath = "ACLUSTER"; private static final String indexAssortmentClusterPath = "ACLUSTER";
private static final int assortmentLimit = 50; private static final int assortmentLimit = 50;
private static final int ramcacheLimit = 70; private static final int ramcacheLimit = 51;
// class variables // class variables

@ -412,10 +412,12 @@ xpstopw=true
30_peerping_busysleep=120000 30_peerping_busysleep=120000
40_peerseedcycle_idlesleep=1800000 40_peerseedcycle_idlesleep=1800000
40_peerseedcycle_busysleep=1200000 40_peerseedcycle_busysleep=1200000
50_localcrawl_idlesleep=15000 50_localcrawl_idlesleep=10000
50_localcrawl_busysleep=0 50_localcrawl_busysleep=0
60_globalcrawl_idlesleep=30000 61_globalcrawltrigger_idlesleep=10000
60_globalcrawl_busysleep=3000 61_globalcrawltrigger_busysleep=0
62_remotetriggeredcrawl_idlesleep=20000
62_remotetriggeredcrawl_busysleep=0
70_cachemanager_idlesleep=10000 70_cachemanager_idlesleep=10000
70_cachemanager_busysleep=0 70_cachemanager_busysleep=0
80_dequeue_idlesleep=10000 80_dequeue_idlesleep=10000

Loading…
Cancel
Save