*) next step of restructuring for new crawlers

- avoid using the HTTP crawler class directly; use the interface class instead

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2476 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent f94131c13d
commit 4e2a950ac9

@ -47,7 +47,7 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLoaderMessage;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.crawler.http.CrawlWorker;
import de.anomic.plasma.crawler.plasmaCrawlWorker;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@ -74,8 +74,8 @@ public class IndexCreateLoaderQueue_p {
yacySeed initiator;
int i, count = 0;
for (i = 0; i < threadCount; i++) {
CrawlWorker theWorker = (CrawlWorker)threadList[i];
plasmaCrawlLoaderMessage theMsg = theWorker.theMsg;
plasmaCrawlWorker theWorker = (plasmaCrawlWorker)threadList[i];
plasmaCrawlLoaderMessage theMsg = theWorker.getMessage();
if (theMsg == null) continue;
initiator = yacyCore.seedDB.getConnected(theMsg.initiator);

@ -197,7 +197,7 @@ public class PerformanceQueues_p {
GenericKeyedObjectPool.Config crawlerPoolConfig = switchboard.cacheLoader.getPoolConfig();
int maxActive = Integer.parseInt(post.get("Crawler Pool_maxActive","8"));
int maxIdle = Integer.parseInt(post.get("Crawler Pool_maxIdle","4"));
int minIdle = Integer.parseInt(post.get("Crawler Pool_minIdle","0"));
int minIdle = 0; // Integer.parseInt(post.get("Crawler Pool_minIdle","0"));
//crawlerPoolConfig.minIdle = (minIdle > maxIdle) ? maxIdle/2 : minIdle;
crawlerPoolConfig.maxIdle = (maxIdle > maxActive) ? maxActive/2 : maxIdle;

@ -27,6 +27,9 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
public boolean destroyed = false;
protected boolean running = false;
protected boolean stopped = false;
/**
* Specifies that the execution of the current crawl job has finished
*/
protected boolean done = false;
/* ============================================================
@ -86,6 +89,14 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.log = theLog;
}
/**
 * Renames this worker thread to {@code plasmaCrawlWorker.threadBaseName}
 * followed by the given trailer.
 *
 * @param trailer suffix appended to the common thread base name
 */
public void setNameTrailer(String trailer) {
this.setName(plasmaCrawlWorker.threadBaseName + trailer);
}
/**
 * Returns the crawl loader message stored in this worker's {@code theMsg} field.
 *
 * @return the value of {@code theMsg}
 *         NOTE(review): presumably {@code null} while no message is assigned - confirm
 */
public plasmaCrawlLoaderMessage getMessage() {
return this.theMsg;
}
public abstract void close();
public void run() {
@ -173,6 +184,10 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
/**
 * Sets the {@code stopped} flag of this worker.
 *
 * @param isStopped new value of the {@code stopped} flag
 */
public void setStopped(boolean isStopped) {
this.stopped = isStopped;
}
/**
 * Sets the {@code destroyed} flag of this worker.
 *
 * @param isDestroyed new value of the {@code destroyed} flag
 */
public void setDestroyed(boolean isDestroyed) {
this.destroyed = isDestroyed;
}
public boolean isRunning() {
return this.running;

@ -10,6 +10,13 @@ public interface plasmaCrawlWorker {
public static final String threadBaseName = "CrawlerWorker";
public void setNameTrailer(String trailer);
public void setStopped(boolean isStopped);
public void setDestroyed(boolean isDestroyed);
public plasmaCrawlLoaderMessage getMessage();
public void reset();
public void execute();
public void execute(plasmaCrawlLoaderMessage theNewMsg);

@ -6,7 +6,6 @@ import org.apache.commons.pool.KeyedPoolableObjectFactory;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.crawler.http.CrawlWorker;
import de.anomic.server.logging.serverLog;
public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
@ -62,7 +61,7 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
} );
// instantiating class
CrawlWorker theCrawlWorker = (CrawlWorker) classConstructor.newInstance(new Object[] {
plasmaCrawlWorker theCrawlWorker = (plasmaCrawlWorker) classConstructor.newInstance(new Object[] {
this.theThreadGroup,
this.thePool,
this.sb,
@ -86,13 +85,13 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
*/
public void destroyObject(Object key, Object obj) {
if (obj == null) return;
if (obj instanceof CrawlWorker) {
CrawlWorker theWorker = (CrawlWorker) obj;
if (obj instanceof plasmaCrawlWorker) {
plasmaCrawlWorker theWorker = (plasmaCrawlWorker) obj;
synchronized(theWorker) {
theWorker.destroyed = true;
theWorker.setName(plasmaCrawlWorker.threadBaseName + "_destroyed");
theWorker.setDestroyed(true);
theWorker.setNameTrailer("_destroyed");
theWorker.setStopped(true);
theWorker.interrupt();
((Thread)theWorker).interrupt();
}
}
}

@ -1,8 +1,6 @@
package de.anomic.plasma.crawler;
import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import de.anomic.plasma.crawler.http.CrawlWorker;
import de.anomic.server.logging.serverLog;
public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
@ -21,12 +19,12 @@ public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
public void returnObject(Object key,Object obj) {
if (obj == null) return;
if (obj instanceof CrawlWorker) {
if (obj instanceof plasmaCrawlWorker) {
try {
((CrawlWorker)obj).setName(plasmaCrawlWorker.threadBaseName + "_inPool");
((plasmaCrawlWorker)obj).setNameTrailer("_inPool");
super.returnObject(key,obj);
} catch (Exception e) {
((CrawlWorker)obj).setStopped(true);
((plasmaCrawlWorker)obj).setStopped(true);
serverLog.logSevere("CRAWLER-POOL","Unable to return crawler thread to pool.",e);
}
} else {
@ -38,10 +36,10 @@ public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
public void invalidateObject(Object key,Object obj) {
if (obj == null) return;
if (this.isClosed) return;
if (obj instanceof CrawlWorker) {
if (obj instanceof plasmaCrawlWorker) {
try {
((CrawlWorker)obj).setName(plasmaCrawlWorker.threadBaseName + "_invalidated");
((CrawlWorker)obj).setStopped(true);
((plasmaCrawlWorker)obj).setNameTrailer("_invalidated");
((plasmaCrawlWorker)obj).setStopped(true);
super.invalidateObject(key,obj);
} catch (Exception e) {
serverLog.logSevere("CRAWLER-POOL","Unable to invalidate crawling thread.",e);
@ -64,7 +62,7 @@ public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
// signaling shutdown to all still running or pooled threads ...
serverLog.logInfo("CRAWLER","Signaling shutdown to " + threadCount + " remaining crawler threads ...");
for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
((CrawlWorker)threadList[currentThreadIdx]).setStopped(true);
((plasmaCrawlWorker)threadList[currentThreadIdx]).setStopped(true);
}
// giving the crawlers some time to finish shutdown
@ -80,7 +78,7 @@ public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
Thread currentThread = threadList[currentThreadIdx];
if (currentThread.isAlive()) {
serverLog.logInfo("CRAWLER","Trying to shutdown crawler thread '" + currentThread.getName() + "' [" + currentThreadIdx + "].");
((CrawlWorker)currentThread).close();
((plasmaCrawlWorker)currentThread).close();
}
}

@ -48,10 +48,10 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.net.URL;
import de.anomic.plasma.crawler.plasmaCrawlWorker;
import de.anomic.plasma.crawler.plasmaCrawlerFactory;
import de.anomic.plasma.crawler.plasmaCrawlerMsgQueue;
import de.anomic.plasma.crawler.plasmaCrawlerPool;
import de.anomic.plasma.crawler.http.CrawlWorker;
import de.anomic.server.logging.serverLog;
public final class plasmaCrawlLoader extends Thread {
@ -147,8 +147,12 @@ public final class plasmaCrawlLoader extends Thread {
String protocol = theMsg.url.getProtocol();
// getting a new crawler from the crawler pool
CrawlWorker theWorker = (CrawlWorker) this.crawlwerPool.borrowObject(protocol);
if (theWorker != null) theWorker.execute(theMsg);
plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
if (theWorker == null) {
this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + theMsg.url);
} else {
theWorker.execute(theMsg);
}
}
public void run() {

Loading…
Cancel
Save