fixed a problem with switching between networks

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6247 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 0575f12838
commit 92edd24e70

@@ -63,7 +63,7 @@ public class CrawlQueues {
     public NoticedURL noticeURL;
     public ZURL errorURL, delegatedURL;
 
-    public CrawlQueues(final Switchboard sb, final File plasmaPath) {
+    public CrawlQueues(final Switchboard sb, final File queuePath) {
         this.sb = sb;
         this.log = new Log("CRAWLER");
         this.workers = new ConcurrentHashMap<Integer, crawlWorker>();
@@ -71,16 +71,69 @@ public class CrawlQueues {
         // start crawling management
         log.logConfig("Starting Crawling Management");
-        noticeURL = new NoticedURL(plasmaPath);
+        noticeURL = new NoticedURL(queuePath);
         //errorURL = new plasmaCrawlZURL(); // fresh error DB each startup; can be hold in RAM and reduces IO;
-        final File errorDBFile = new File(plasmaPath, "urlError2.db");
+        final File errorDBFile = new File(queuePath, "urlError2.db");
         if (errorDBFile.exists()) {
             // delete the error db to get a fresh each time on startup
             // this is useful because there is currently no re-use of the data in this table.
-            if (errorDBFile.isDirectory()) SplitTable.delete(plasmaPath, "urlError2.db"); else FileUtils.deletedelete(errorDBFile);
+            if (errorDBFile.isDirectory()) SplitTable.delete(queuePath, "urlError2.db"); else FileUtils.deletedelete(errorDBFile);
         }
+        errorURL = new ZURL(queuePath, "urlError3.db", false);
+        delegatedURL = new ZURL(queuePath, "urlDelegated3.db", true);
+    }
+
+    public void relocate(final File newQueuePath) {
+        this.close();
+        this.workers = new ConcurrentHashMap<Integer, crawlWorker>();
+        this.remoteCrawlProviderHashes.clear();
+        noticeURL = new NoticedURL(newQueuePath);
+        final File errorDBFile = new File(newQueuePath, "urlError2.db");
+        if (errorDBFile.exists()) {
+            if (errorDBFile.isDirectory()) SplitTable.delete(newQueuePath, "urlError2.db"); else FileUtils.deletedelete(errorDBFile);
+        }
+        errorURL = new ZURL(newQueuePath, "urlError3.db", false);
+        delegatedURL = new ZURL(newQueuePath, "urlDelegated3.db", true);
+    }
+
+    public void close() {
+        // wait for all workers to finish
+        for (final crawlWorker w: workers.values()) {
+            w.interrupt();
+        }
+        for (final crawlWorker w: workers.values()) {
+            try {
+                w.join();
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+            }
+        }
+        noticeURL.close();
+        errorURL.close();
+        delegatedURL.close();
+    }
+
+    public void clear() {
+        // wait for all workers to finish
+        for (final crawlWorker w: workers.values()) {
+            w.interrupt();
+        }
+        // TODO: wait some more time until all threads are finished
+        workers.clear();
+        remoteCrawlProviderHashes.clear();
+        noticeURL.clear();
+        try {
+            errorURL.clear();
+        } catch (final IOException e) {
+            e.printStackTrace();
+        }
+        try {
+            delegatedURL.clear();
+        } catch (final IOException e) {
+            e.printStackTrace();
+        }
+    }
-        errorURL = new ZURL(plasmaPath, "urlError3.db", false);
-        delegatedURL = new ZURL(plasmaPath, "urlDelegated3.db", true);
-    }
 
     /**
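The central addition in this hunk is relocate(): on a network switch the CrawlQueues object is not discarded and rebuilt, it stays alive and re-opens its queue files under the new path, because busy threads elsewhere keep a reference to the same instance (see the Switchboard hunks below). Note that the real relocate() first joins all workers via close() before re-opening. A minimal sketch of this relocate-in-place pattern; QueueHolder and Store are illustrative stand-ins for CrawlQueues and its file-backed tables, not YaCy API:

import java.io.File;

// Relocate-in-place: owner threads hold this object for their whole
// lifetime, while the files behind it move to a new directory.
public class QueueHolder {

    // Stand-in for NoticedURL / ZURL: something file-backed and closeable.
    static class Store {
        Store(final File dir) { dir.mkdirs(); /* open tables under dir */ }
        void close() { /* flush and release file handles */ }
    }

    private Store store;

    public QueueHolder(final File path) {
        this.store = new Store(path);
    }

    // The reference to this QueueHolder never changes, so threads that
    // cached it before the switch keep working afterwards.
    public synchronized void relocate(final File newPath) {
        this.store.close();              // release the old files first
        this.store = new Store(newPath); // then re-open under the new path
    }

    public synchronized void close() {
        this.store.close();
    }
}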
@@ -127,44 +180,6 @@ public class CrawlQueues {
         }
     }
-
-    public void clear() {
-        // wait for all workers to finish
-        for (final crawlWorker w: workers.values()) {
-            w.interrupt();
-        }
-        // TODO: wait some more time until all threads are finished
-        workers.clear();
-        remoteCrawlProviderHashes.clear();
-        noticeURL.clear();
-        try {
-            errorURL.clear();
-        } catch (final IOException e) {
-            e.printStackTrace();
-        }
-        try {
-            delegatedURL.clear();
-        } catch (final IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void close() {
-        // wait for all workers to finish
-        for (final crawlWorker w: workers.values()) {
-            w.interrupt();
-        }
-        for (final crawlWorker w: workers.values()) {
-            try {
-                w.join();
-            } catch (InterruptedException e) {
-                e.printStackTrace();
-            }
-        }
-        noticeURL.close();
-        errorURL.close();
-        delegatedURL.close();
-    }
-
     public Request[] activeWorkerEntries() {
         synchronized (workers) {
             final Request[] e = new Request[workers.size()];
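The clear() and close() bodies removed in this hunk are the same ones re-added next to the constructor above, so this is a move, not a deletion. Worth noting in close() is the two-phase shutdown: all workers are interrupted first, then joined in a second loop, so they wind down in parallel instead of one after another. A self-contained sketch of that pattern; WorkerPool is an illustrative name, and the interrupt-status handling is a small improvement over the printStackTrace() in the original:

import java.util.concurrent.ConcurrentHashMap;

public class WorkerPool {
    private final ConcurrentHashMap<Integer, Thread> workers =
            new ConcurrentHashMap<Integer, Thread>();

    public void close() {
        // phase 1: signal every worker, so all of them stop concurrently
        for (final Thread w : workers.values()) {
            w.interrupt();
        }
        // phase 2: wait until every worker has actually finished
        for (final Thread w : workers.values()) {
            try {
                w.join();
            } catch (final InterruptedException e) {
                Thread.currentThread().interrupt(); // preserve interrupt status
            }
        }
        workers.clear();
    }
}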
@@ -195,9 +210,16 @@ public class CrawlQueues {
                     ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off"));
         }
-        if(!crawlIsPossible(NoticedURL.STACK_TYPE_CORE, "Core")) return false;
+        String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_CORE, "Core");
+        if (queueCheck != null) {
+            if (log.isFine()) log.logFine("omitting de-queue/local: " + queueCheck);
+            return false;
+        }
-        if(isPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) return false;
+        if (isPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
+            if (log.isFinest()) log.logFinest("omitting de-queue/local: paused");
+            return false;
+        }
 
         // do a local crawl
         Request urlEntry = null;
@@ -291,34 +313,29 @@ public class CrawlQueues {
      * @param type
      * @return
      */
-    private boolean crawlIsPossible(int stackType, final String type) {
-        int value;
+    private String crawlIsPossible(int stackType, final String type) {
         //System.out.println("stacksize = " + noticeURL.stackSize(stackType));
         if (noticeURL.stackSize(stackType) == 0) {
             //log.logDebug("GlobalCrawl: queue is empty");
-            return false;
+            return "stack is empty";
         }
-        value = (int) sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
-        if (this.size() >= value) {
-            // try a cleanup
+
+        // check the worker threads
+        int maxWorkers = (int) sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
+        if (this.workers.size() >= maxWorkers) {
+            // too many worker threads, try a cleanup
             this.cleanup();
         }
         // check again
-        if (this.size() >= value) {
-            if (this.log.isFine()) {
-                log.logFine(type + "Crawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + "), httpClients = " + Client.connectionCount());
-            }
-            return false;
+        if (this.workers.size() >= maxWorkers) {
+            return "too many workers active: " + this.workers.size();
         }
         String cautionCause = sb.onlineCaution();
         if (cautionCause != null) {
-            if (this.log.isFine()) {
-                log.logFine(type + "Crawl: online caution for " + cautionCause + ", omitting processing");
-            }
-            return false;
+            return "online caution: " + cautionCause;
         }
-        return true;
+        return null;
     }
 
     public boolean remoteCrawlLoaderJob() {
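crawlIsPossible() now returns the reason crawling must be skipped, or null when it may proceed, and the local and remote de-queue jobs turn that reason into a single FINE-level log line. Compared to the old boolean version, the logging moves from the check to its callers, so each caller can prefix its own context ("de-queue/local", "de-queue/remote"). A self-contained sketch of this reason-or-null style, using java.util.logging in place of YaCy's Log wrapper; ReasonOrNullDemo and its fields are illustrative:

import java.util.logging.Level;
import java.util.logging.Logger;

public class ReasonOrNullDemo {
    private static final Logger log = Logger.getLogger("CRAWLER");
    private static final int MAX_WORKERS = 10;
    private int queueSize = 0;    // pretend state
    private int workerCount = 12; // pretend state

    // Returns null when the job may run, otherwise a short explanation.
    private String crawlIsPossible() {
        if (queueSize == 0) return "stack is empty";
        if (workerCount >= MAX_WORKERS) return "too many workers active: " + workerCount;
        return null;
    }

    public boolean coreCrawlJob() {
        final String queueCheck = crawlIsPossible();
        if (queueCheck != null) {
            // one log line per skipped cycle names the exact cause
            if (log.isLoggable(Level.FINE)) log.fine("omitting de-queue/local: " + queueCheck);
            return false;
        }
        // ... de-queue and dispatch one crawl entry here ...
        return true;
    }
}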
@@ -467,9 +484,16 @@ public class CrawlQueues {
         // do nothing if either there are private processes to be done
         // or there is no global crawl on the stack
-        if (!crawlIsPossible(NoticedURL.STACK_TYPE_REMOTE, "Global")) return false;
+        String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_REMOTE, "Global");
+        if (queueCheck != null) {
+            if (log.isFine()) log.logFine("omitting de-queue/remote: " + queueCheck);
+            return false;
+        }
-        if (isPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) return false;
+        if (isPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) {
+            if (log.isFinest()) log.logFinest("omitting de-queue/remote: paused");
+            return false;
+        }
 
         // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
         final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", "

@@ -809,11 +809,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
         this.crawlStacker.close();
         this.webStructure.close();
         this.robots.close();
-        this.crawlQueues.close();
 
         log.logInfo("SWITCH NETWORK: START UP OF NEW INDEX DATABASE...");
 
-        // start up
+        // new properties
         setConfig("network.unit.definition", networkDefinition);
         overwriteNetworkDefinition();
         final File indexPrimaryPath = getConfigPath(SwitchboardConstants.INDEX_PRIMARY_PATH, SwitchboardConstants.INDEX_PATH_DEFAULT);

@@ -826,6 +825,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
         this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
         this.networkRoot.mkdirs();
         this.queuesRoot.mkdirs();
+
+        // relocate
+        this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
         final File mySeedFile = new File(this.networkRoot, yacySeedDB.DBFILE_OWN_SEED);
         peers = new yacySeedDB(
                 this.networkRoot,

@@ -844,13 +846,14 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
         } catch (IOException e) {
             e.printStackTrace();
         }
+        // startup
         crawler = new CrawlSwitchboard(
                 peers,
                 networkName,
                 log,
                 this.queuesRoot);
 
         // create new web structure
         this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
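The ordering in the network-switch routine is the point of the whole commit: the old code closed crawlQueues together with the other components, before the new per-network QUEUES directory even existed, while busy threads still held the object. Now crawlQueues survives the switch and is relocated only after queuesRoot has been created. A condensed sketch of the resulting sequence; CrawlQueuesLike and the method signature are illustrative, only relocate() mirrors the diff:

import java.io.File;

public final class NetworkSwitchSketch {

    interface CrawlQueuesLike {
        void relocate(File newQueuePath); // re-opens all queues under the new path
    }

    static void switchNetwork(final CrawlQueuesLike crawlQueues,
                              final File indexPrimaryPath,
                              final String networkName) {
        // 1. the new per-network queue directory must exist first
        final File queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
        queuesRoot.mkdirs();

        // 2. relocate instead of close + new: busy threads keep their
        //    reference to the same crawlQueues object across the switch
        crawlQueues.relocate(queuesRoot);
    }
}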
