Init remote crawler on demand

If the remote crawl option is not activated, skip initialization of the remoteCrawlJob to save the resources of the queue and an idling thread.
Deployment of the remoteCrawlJob is deferred until the option is activated.
pull/8/head
reger 10 years ago
parent dbf9e3503d
commit 3e742d1e34
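
The pattern repeated throughout this diff: fields that used to be allocated eagerly in the constructor stay null until the remote crawl option is switched on, every reader gains a null guard, and the two busy threads are deployed only from the new initRemoteCrawler(). A minimal, self-contained Java sketch of that on-demand initialization pattern (class and member names here are illustrative, not YaCy's API):

import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

public class RemoteCrawlService {

    private volatile Queue<String> remoteQueue; // stays null until the option is activated
    private volatile Thread worker;             // stays null until the option is activated

    // enable the feature; queue and thread are allocated only on first activation
    public synchronized void setActivated(final boolean activate) {
        if (!activate) return; // on deactivation nothing is torn down; callers simply stop pushing
        if (this.remoteQueue == null) this.remoteQueue = new ConcurrentLinkedQueue<String>();
        if (this.worker == null) {
            this.worker = new Thread(this::drainLoop, "remote-crawl-worker");
            this.worker.setDaemon(true);
            this.worker.start();
        }
    }

    // push is a no-op while the feature is off, mirroring the null-guarded pushes in the diff
    public boolean push(final String url) {
        final Queue<String> q = this.remoteQueue;
        return q != null && q.offer(url);
    }

    // size reports 0 while uninitialized, like the guarded getJobCount() calls in Status/status_p
    public int size() {
        final Queue<String> q = this.remoteQueue;
        return q == null ? 0 : q.size();
    }

    private void drainLoop() {
        while (!Thread.currentThread().isInterrupted()) {
            final String url = this.remoteQueue.poll();
            if (url == null) {
                try { Thread.sleep(1000); } catch (final InterruptedException e) { return; } // idle sleep
            } else {
                // process the remote crawl request for url ...
            }
        }
    }
}

The invariant visible in every hunk below: any code path that may run before activation must treat these fields as absent rather than assume the constructor created them.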

@@ -143,7 +143,7 @@ public class ConfigNetwork_p
         prop.put("commit", commit);
         // write remote crawl request settings
-        prop.put("crawlResponse", sb.getConfigBool("crawlResponse", false) ? "1" : "0");
+        prop.put("crawlResponse", sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false) ? "1" : "0");
         final long RTCbusySleep =
             Math
                 .max(1, env.getConfigInt(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 100));
@@ -166,7 +166,7 @@ public class ConfigNetwork_p
         prop.put("indexReceiveSearchChecked", indexReceiveSearch);
         // set seed information directly
-        sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool("crawlResponse", false));
+        sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
         sb.peers.mySeed().setFlagAcceptRemoteIndex(indexReceive);
         // set p2p/robinson mode flags and values

@@ -56,7 +56,7 @@ public class RemoteCrawl_p {
             boolean crawlResponse = post.get("crawlResponse", "off").equals("on");
             // read remote crawl request settings
-            sb.setConfig("crawlResponse", crawlResponse);
+            sb.initRemoteCrawler(crawlResponse);
         }
         if (post.containsKey("acceptCrawlLimit")) {
@@ -70,7 +70,7 @@ public class RemoteCrawl_p {
         }
         // set seed information directly
-        sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool("crawlResponse", false));
+        sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
         // write remote crawl request settings
         prop.put("disabled", !sb.peers.mySeed().isActive() && !sb.peers.mySeed().getFlagAcceptRemoteCrawl() ? 1 : 0);

@@ -356,7 +356,7 @@ public class Status
         prop.putNum(
             "remoteTriggeredCrawlQueueSize",
-            sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
+            sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) != null ? sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount() : 0);
         prop.put(
             "remoteTriggeredCrawlPaused",
             sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? "1" : "0");

@@ -105,7 +105,7 @@ public class status_p {
         prop.put("limitCrawlState", STATE_RUNNING);
         //remote crawl queue
-        prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
+        prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) != null ? sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount() : 0);
         prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
         //noload crawl queue

@@ -143,7 +143,7 @@ public final class crawlReceipt {
             return prop;
         }
-        if ("fill".equals(result)) try {
+        if ("fill".equals(result) && sb.crawlQueues.delegatedURL != null) try {
             // put new entry into database
             sb.index.fulltext().putMetadata(entry);
             ResultURLs.stack(ASCII.String(entry.url().hash()), entry.url().getHost(), youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS);
@@ -159,8 +159,10 @@ public final class crawlReceipt {
             return prop;
         }
-        sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
-        sb.crawlQueues.errorURL.push(entry.url(), 997, null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1);
+        if (sb.crawlQueues.delegatedURL != null) { // the delegated work is transformed into an error case
+            sb.crawlQueues.delegatedURL.remove(entry.hash());
+            sb.crawlQueues.errorURL.push(entry.url(), 997, null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1);
+        }
         //switchboard.noticeURL.remove(receivedUrlhash);
         prop.put("delay", "3600");
         return prop;

@@ -83,7 +83,7 @@ public class urls {
                 }
                 // place url to notice-url db
-                sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
+                if (sb.crawlQueues.delegatedURL != null) sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
                 // create RSS entry
                 prop.put("item_" + c + "_title", "");

@@ -21,6 +21,7 @@
 package net.yacy.crawler;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.Iterator;
 import java.util.List;
@@ -90,6 +91,9 @@ public interface Balancer {
      */
     public int size();
 
+    public int getOnDemandLimit();
+
+    public boolean getExceed134217727();
+
     /**
      * check if stack is empty
      * @return true iff size() == 0

@@ -202,7 +202,16 @@ public class HostBalancer implements Balancer {
         }
         return true;
     }
 
+    @Override
+    public int getOnDemandLimit() {
+        return this.onDemandLimit;
+    }
+
+    @Override
+    public boolean getExceed134217727() {
+        return this.exceed134217727;
+    }
+
     /**
      * push a request to one of the host queues. If the queue does not exist, it is created
      * @param entry

@@ -544,4 +544,14 @@ public class HostQueue implements Balancer {
         return cel;
     }
 
+    @Override
+    public int getOnDemandLimit() {
+        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+    }
+
+    @Override
+    public boolean getExceed134217727() {
+        return this.exceed134217727;
+    }
+
 }

@@ -76,6 +76,16 @@ public class LegacyBalancer implements Balancer {
     private final List<Map.Entry<String, byte[]>> zeroWaitingCandidates;
     private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting
 
+    @Override
+    public int getOnDemandLimit() {
+        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+    }
+
+    @Override
+    public boolean getExceed134217727() {
+        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+    }
+
     private static class HostHandles {
         public String hosthash;
         public HandleSet handleSet;

@@ -72,7 +72,7 @@ public class CrawlQueues {
     private final Switchboard sb;
     private final Loader[] worker;
     private final ArrayBlockingQueue<Request> workerQueue;
-    private final ArrayList<String> remoteCrawlProviderHashes;
+    private ArrayList<String> remoteCrawlProviderHashes;
 
     public NoticedURL noticeURL;
     public ErrorCache errorURL;
@@ -83,7 +83,7 @@ public class CrawlQueues {
         final int maxWorkers = (int) sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
         this.worker = new Loader[maxWorkers];
         this.workerQueue = new ArrayBlockingQueue<Request>(200);
-        this.remoteCrawlProviderHashes = new ArrayList<String>();
+        this.remoteCrawlProviderHashes = null;
 
         // start crawling management
         log.config("Starting Crawling Management");
@@ -92,10 +92,16 @@ public class CrawlQueues {
         log.config("Opening errorURL..");
         this.errorURL = new ErrorCache(sb.index.fulltext());
         log.config("Opening delegatedURL..");
-        this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();
-        log.config("Finishted Startup of Crawling Management");
+        this.delegatedURL = null;
+    }
+
+    public void initRemoteCrawlQueues () {
+        if (this.remoteCrawlProviderHashes == null) this.remoteCrawlProviderHashes = new ArrayList<String>();
+        if (this.delegatedURL == null) {
+            this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();
+            log.config("Finishted Startup of Crawling Management");
+        }
     }
 
     /**
      * Relocation is necessary if the user switches the network.
      * Because this object is part of the scheduler we cannot simply close that object and create a new one.
@@ -106,10 +112,10 @@ public class CrawlQueues {
         // removed pending requests
         this.workerQueue.clear();
         this.errorURL.clearCache();
-        this.remoteCrawlProviderHashes.clear();
+        if (this.remoteCrawlProviderHashes != null) this.remoteCrawlProviderHashes.clear();
         this.noticeURL.close();
         this.noticeURL = new NoticedURL(newQueuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), this.sb.exceed134217727);
-        this.delegatedURL.clear();
+        if (this.delegatedURL != null) this.delegatedURL.clear();
     }
 
     public synchronized void close() {
@@ -130,16 +136,16 @@ public class CrawlQueues {
             }
         }
         this.noticeURL.close();
-        this.delegatedURL.clear();
+        if (this.delegatedURL != null) this.delegatedURL.clear();
     }
 
     public void clear() {
         // wait for all workers to finish
         this.workerQueue.clear();
         for (final Loader w: this.worker) if (w != null) w.interrupt();
-        this.remoteCrawlProviderHashes.clear();
+        if (this.remoteCrawlProviderHashes != null) this.remoteCrawlProviderHashes.clear();
         this.noticeURL.clear();
-        this.delegatedURL.clear();
+        if (this.delegatedURL != null) this.delegatedURL.clear();
     }
 
     /**
@@ -148,7 +154,7 @@ public class CrawlQueues {
      * @return if the hash exists, the name of the database is returned, otherwise null is returned
      */
     public HarvestProcess exists(final byte[] hash) {
-        if (this.delegatedURL.containsKey(ASCII.String(hash))) {
+        if (this.delegatedURL != null && this.delegatedURL.containsKey(ASCII.String(hash))) {
             return HarvestProcess.DELEGATED;
         }
         //if (this.noticeURL.existsInStack(hash)) {
@@ -181,7 +187,7 @@ public class CrawlQueues {
     public void removeURL(final byte[] hash) {
         assert hash != null && hash.length == 12;
         this.noticeURL.removeByURLHash(hash);
-        this.delegatedURL.remove(hash);
+        if (this.delegatedURL != null) this.delegatedURL.remove(hash);
     }
 
     public int removeHosts(final Set<String> hosthashes) {
@@ -194,9 +200,11 @@ public class CrawlQueues {
         if (urlhash == null || urlhash.length == 0) {
             return null;
         }
-        DigestURL u = this.delegatedURL.get(ASCII.String(urlhash));
-        if (u != null) {
-            return u;
+        if (this.delegatedURL != null) {
+            DigestURL u = this.delegatedURL.get(ASCII.String(urlhash));
+            if (u != null) {
+                return u;
+            }
         }
         for (final DigestURL url: activeWorkerEntries().keySet()) {
             if (Base64Order.enhancedCoder.equal(url.hash(), urlhash)) {
@@ -456,7 +464,7 @@ public class CrawlQueues {
         // check if we have an entry in the provider list, otherwise fill the list
         Seed seed;
-        if (this.remoteCrawlProviderHashes.isEmpty()) {
+        if (this.remoteCrawlProviderHashes != null && this.remoteCrawlProviderHashes.isEmpty()) {
             if (this.sb.peers != null && this.sb.peers.sizeConnected() > 0) {
                 final Iterator<Seed> e = DHTSelection.getProvidesRemoteCrawlURLs(this.sb.peers);
                 while (e.hasNext()) {
@@ -467,14 +475,14 @@ public class CrawlQueues {
                 }
             }
         }
-        if (this.remoteCrawlProviderHashes.isEmpty()) {
+        if (this.remoteCrawlProviderHashes == null || this.remoteCrawlProviderHashes.isEmpty()) {
             return false;
         }
 
         // take one entry from the provider list and load the entries from the remote peer
         seed = null;
         String hash = null;
-        while (seed == null && !this.remoteCrawlProviderHashes.isEmpty()) {
+        while (seed == null && (this.remoteCrawlProviderHashes != null && !this.remoteCrawlProviderHashes.isEmpty())) {
             hash = this.remoteCrawlProviderHashes.remove(this.remoteCrawlProviderHashes.size() - 1);
             if (hash == null) {
                 continue;

@@ -46,6 +46,7 @@ import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
+import net.yacy.kelondro.util.MemoryControl;
 
 public class NoticedURL {
@@ -55,8 +56,9 @@ public class NoticedURL {
     private Balancer coreStack;   // links found by crawling to depth-1
     private Balancer limitStack;  // links found by crawling at target depth
-    private Balancer remoteStack; // links from remote crawl orders
+    private Balancer remoteStack; // links from remote crawl orders (init on demand)
     private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
+    private final File cachePath;
 
     protected NoticedURL(
             final File cachePath,
@@ -64,16 +66,28 @@ public class NoticedURL {
             final boolean exceed134217727) {
         ConcurrentLog.info("NoticedURL", "START CREATING STACKS at " + cachePath.toString());
         ConcurrentLog.info("NoticedURL", "opening CrawlerCoreStacks..");
+        this.cachePath = cachePath;
         this.coreStack = new HostBalancer(new File(cachePath, "CrawlerCoreStacks"), onDemandLimit, exceed134217727);
         ConcurrentLog.info("NoticedURL", "opening CrawlerLimitStacks..");
         this.limitStack = new HostBalancer(new File(cachePath, "CrawlerLimitStacks"), onDemandLimit, exceed134217727);
-        ConcurrentLog.info("NoticedURL", "opening CrawlerRemoteStacks..");
-        this.remoteStack = new HostBalancer(new File(cachePath, "CrawlerRemoteStacks"), onDemandLimit, exceed134217727);
+        this.remoteStack = null; // init on demand (on first push)
         ConcurrentLog.info("NoticedURL", "opening CrawlerNoLoadStacks..");
         this.noloadStack = new HostBalancer(new File(cachePath, "CrawlerNoLoadStacks"), onDemandLimit, exceed134217727);
         ConcurrentLog.info("NoticedURL", "FINISHED CREATING STACKS at " + cachePath.toString());
     }
 
+    /**
+     * Init Remote crawl stack, internally called on 1st push to remoteStack
+     */
+    protected void initRemoteStack() {
+        if (this.remoteStack == null && !MemoryControl.shortStatus()) {
+            ConcurrentLog.info("NoticedURL", "opening CrawlerRemoteStacks..");
+            this.remoteStack = new HostBalancer(new File(this.cachePath, "CrawlerRemoteStacks"), this.coreStack.getOnDemandLimit(), this.coreStack.getExceed134217727());
+        }
+    }
+
     public void clear() {
         ConcurrentLog.info("NoticedURL", "CLEARING ALL STACKS");
         if (this.coreStack != null) this.coreStack.clear();
@@ -113,7 +127,6 @@ public class NoticedURL {
     }
 
     public int size() {
-        // this does not count the overhang stack size
        return ((this.coreStack == null) ? 0 : this.coreStack.size()) + ((this.limitStack == null) ? 0 : this.limitStack.size()) + ((this.remoteStack == null) ? 0 : this.remoteStack.size());
     }
@@ -127,7 +140,7 @@ public class NoticedURL {
     public boolean isEmpty() {
         if (!isEmptyLocal()) return false;
-        if (!this.remoteStack.isEmpty()) return false;
+        if (this.remoteStack != null && !this.remoteStack.isEmpty()) return false;
         return true;
     }
@@ -155,8 +168,7 @@ public class NoticedURL {
         return
             this.coreStack.has(urlhashb) ||
             this.limitStack.has(urlhashb) ||
-            //overhangStack.has(urlhashb) ||
-            this.remoteStack.has(urlhashb) ||
+            (this.remoteStack != null && this.remoteStack.has(urlhashb)) ||
             this.noloadStack.has(urlhashb);
     }
@@ -169,11 +181,16 @@ public class NoticedURL {
     public String push(final StackType stackType, final Request entry, CrawlProfile profile, final RobotsTxt robots) {
         try {
             switch (stackType) {
                 case LOCAL: return this.coreStack.push(entry, profile, robots);
                 case GLOBAL: return this.limitStack.push(entry, profile, robots);
-                case REMOTE: return this.remoteStack.push(entry, profile, robots);
+                case REMOTE: {
+                    if (this.remoteStack == null) {
+                        this.initRemoteStack();
+                    }
+                    return (this.remoteStack != null) ? this.remoteStack.push(entry, profile, robots) : "remote crawler stack deactivated";
+                }
                 case NOLOAD: return this.noloadStack.push(entry, profile, robots);
                 default: return "stack type unknown";
             }
         } catch (final Exception er) {
             ConcurrentLog.logException(er);
@@ -186,7 +203,7 @@ public class NoticedURL {
         try {if ((entry = this.noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
         try {if ((entry = this.coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
         try {if ((entry = this.limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
-        try {if ((entry = this.remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
+        try {if (this.remoteStack != null && (entry = this.remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
         return null;
     }
@@ -204,7 +221,7 @@ public class NoticedURL {
             try {ret |= this.noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}
             try {ret |= this.coreStack.remove(urlHashes) > 0;} catch (final IOException e) {}
             try {ret |= this.limitStack.remove(urlHashes) > 0;} catch (final IOException e) {}
-            try {ret |= this.remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}
+            try {ret |= this.remoteStack != null && this.remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}
             return ret;
         } catch (final SpaceExceededException e) {
             ConcurrentLog.logException(e);
@@ -217,7 +234,7 @@ public class NoticedURL {
         try {removed += this.noloadStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
         try {removed += this.coreStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
         try {removed += this.limitStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
-        try {removed += this.remoteStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
+        if (this.remoteStack != null) try {removed += this.remoteStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
         return removed;
     }
@@ -226,7 +243,7 @@ public class NoticedURL {
         removed += this.noloadStack.removeAllByHostHashes(hosthashes);
         removed += this.coreStack.removeAllByHostHashes(hosthashes);
         removed += this.limitStack.removeAllByHostHashes(hosthashes);
-        removed += this.remoteStack.removeAllByHostHashes(hosthashes);
+        if (this.remoteStack != null) removed += this.remoteStack.removeAllByHostHashes(hosthashes);
         return removed;
     }
@@ -238,7 +255,7 @@ public class NoticedURL {
         switch (stackType) {
             case LOCAL: return this.coreStack.getDomainStackHosts(robots);
             case GLOBAL: return this.limitStack.getDomainStackHosts(robots);
-            case REMOTE: return this.remoteStack.getDomainStackHosts(robots);
+            case REMOTE: return (this.remoteStack != null) ? this.remoteStack.getDomainStackHosts(robots) : null;
             case NOLOAD: return this.noloadStack.getDomainStackHosts(robots);
             default: return null;
         }
@@ -254,7 +271,7 @@ public class NoticedURL {
         switch (stackType) {
             case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount, maxtime);
             case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount, maxtime);
-            case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount, maxtime);
+            case REMOTE: return (this.remoteStack != null) ? this.remoteStack.getDomainStackReferences(host, maxcount, maxtime) : null;
             case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount, maxtime);
             default: return null;
         }
@@ -264,7 +281,7 @@ public class NoticedURL {
         switch (stackType) {
             case LOCAL: return pop(this.coreStack, delay, cs, robots);
             case GLOBAL: return pop(this.limitStack, delay, cs, robots);
-            case REMOTE: return pop(this.remoteStack, delay, cs, robots);
+            case REMOTE: return (this.remoteStack != null) ? pop(this.remoteStack, delay, cs, robots) : null;
             case NOLOAD: return pop(this.noloadStack, false, cs, robots);
             default: return null;
         }
@@ -285,14 +302,25 @@ public class NoticedURL {
     }
 
     public void clear(final StackType stackType) {
         ConcurrentLog.info("NoticedURL", "CLEARING STACK " + stackType);
         switch (stackType) {
-            case LOCAL: this.coreStack.clear(); break;
-            case GLOBAL: this.limitStack.clear(); break;
-            case REMOTE: this.remoteStack.clear(); break;
-            case NOLOAD: this.noloadStack.clear(); break;
-            default: return;
+            case LOCAL:
+                this.coreStack.clear();
+                break;
+            case GLOBAL:
+                this.limitStack.clear();
+                break;
+            case REMOTE:
+                if (this.remoteStack != null) {
+                    this.remoteStack.clear();
+                }
+                break;
+            case NOLOAD:
+                this.noloadStack.clear();
+                break;
+            default:
+                return;
         }
     }
 
     private static Request pop(final Balancer balancer, final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
@@ -331,7 +359,7 @@ public class NoticedURL {
         try {switch (stackType) {
             case LOCAL: return this.coreStack.iterator();
             case GLOBAL: return this.limitStack.iterator();
-            case REMOTE: return this.remoteStack.iterator();
+            case REMOTE: return (this.remoteStack != null) ? this.remoteStack.iterator() : null;
             case NOLOAD: return this.noloadStack.iterator();
             default: return null;
         }} catch (final IOException e) {

@@ -1051,32 +1051,9 @@ public final class Switchboard extends serverSwitch {
                 20000,
                 0),
             10000);
-        deployThread(
-            SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL,
-            "Remote Crawl Job",
-            "thread that performes a single crawl/indexing step triggered by a remote peer",
-            "/IndexCreateQueues_p.html?stack=REMOTE",
-            new InstantBusyThread(
-                this.crawlQueues,
-                SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START,
-                SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT,
-                SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM,
-                0,
-                0),
-            10000);
-        deployThread(
-            SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER,
-            "Remote Crawl URL Loader",
-            "thread that loads remote crawl lists from other peers",
-            null,
-            new InstantBusyThread(
-                this.crawlQueues,
-                SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START,
-                SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT,
-                SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM,
-                10000,
-                10000),
-            10000); // error here?
+        this.initRemoteCrawler(this.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
+
         deployThread(
             SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL,
             "Local Crawl",
@@ -1472,21 +1449,77 @@ public final class Switchboard extends serverSwitch {
         // propagate to crawler
         final BusyThread rct = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
         setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
-        setConfig(
-            SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP,
-            Math.min(10000, newBusySleep * 10));
-        rct.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 1000));
-        rct
-            .setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, 10000));
+        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP,
+            Math.min(10000, newBusySleep * 10));
+        if (rct != null) {
+            rct.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 1000));
+            rct.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, 10000));
+        }
         // propagate to loader
         final BusyThread rcl = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
         setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 4);
-        setConfig(
-            SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP,
-            Math.min(10000, newBusySleep * 20));
-        rcl.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 1000));
-        rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
+        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP,
+            Math.min(10000, newBusySleep * 20));
+        if (rcl != null) {
+            rcl.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 1000));
+            rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
+        }
+    }
+
+    /**
+     * Initialisize and perform all settings to enable remote crawls
+     * (if remote crawl is not in use, save the resources)
+     * @param activate true=enable, false=disable
+     */
+    public void initRemoteCrawler(final boolean activate) {
+        this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE, activate);
+        this.peers.mySeed().setFlagAcceptRemoteCrawl(activate);
+        if (activate) {
+            this.crawlQueues.initRemoteCrawlQueues();
+
+            BusyThread rct = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
+            if (rct == null) {
+                deployThread(
+                    SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL,
+                    "Remote Crawl Job",
+                    "thread that performes a single crawl/indexing step triggered by a remote peer",
+                    "/IndexCreateQueues_p.html?stack=REMOTE",
+                    new InstantBusyThread(
+                        this.crawlQueues,
+                        SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START,
+                        SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT,
+                        SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM,
+                        0,
+                        0),
+                    10000);
+                rct = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
+            }
+            rct.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 1000));
+            rct.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, 10000));
+
+            BusyThread rcl = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
+            if (rcl == null) {
+                deployThread(
+                    SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER,
+                    "Remote Crawl URL Loader",
+                    "thread that loads remote crawl lists from other peers",
+                    null,
+                    new InstantBusyThread(
+                        this.crawlQueues,
+                        SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START,
+                        SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT,
+                        SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM,
+                        10000,
+                        10000),
+                    10000);
+                rcl = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
+            }
+            rcl.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 1000));
+            rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
+        }
     }
 
     public void initMessages() throws IOException {
@@ -2160,7 +2193,7 @@ public final class Switchboard extends serverSwitch {
     public int cleanupJobSize() {
         int c = 1; // run this always!
-        if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
+        if (this.crawlQueues.delegatedURL != null && (this.crawlQueues.delegatedURL.size() > 1000) ) {
             c++;
         }
         if ( (this.crawlQueues.errorURL.stackSize() > 1000) ) {
@@ -2256,7 +2289,7 @@ public final class Switchboard extends serverSwitch {
         // clean up delegated stack
         checkInterruption();
-        if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
+        if (this.crawlQueues.delegatedURL != null && (this.crawlQueues.delegatedURL.size() > 1000) ) {
             if ( this.log.isFine() ) {
                 this.log.fine("Cleaning Delegated-URLs report stack, "
                     + this.crawlQueues.delegatedURL.size()
@@ -3778,7 +3811,7 @@ public final class Switchboard extends serverSwitch {
         mySeed.setFlagDirectConnect(true);
         mySeed.setLastSeenUTC();
         mySeed.put(Seed.UTC, GenericFormatter.UTCDiffString());
-        mySeed.setFlagAcceptRemoteCrawl(getConfigBool("crawlResponse", true));
+        mySeed.setFlagAcceptRemoteCrawl(getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
         mySeed.setFlagAcceptRemoteIndex(getConfigBool("allowReceiveIndex", true));
         mySeed.setFlagSSLAvailable(this.getHttpServer() != null && this.getHttpServer().withSSL() && getConfigBool("server.https", false));
         if (mySeed.getFlagSSLAvailable()) mySeed.put(Seed.PORTSSL, Integer.toString(getPublicPort("port.ssl", 8443)));

@@ -113,6 +113,7 @@ public final class SwitchboardConstants {
      *
      * @see Switchboard#CRAWLJOB_REMOTE_CRAWL_LOADER
      */
+    public static final String CRAWLJOB_REMOTE = "crawlResponse"; // enable/disable response to remote crawl requests
     public static final String CRAWLJOB_REMOTE_CRAWL_LOADER = "60_remotecrawlloader";
     public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START = "remoteCrawlLoaderJob";
     public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT = null;
