*) next step of restructuring for new crawlers

- avoid using the http crawler class directly; use the interface class instead

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2476 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent f94131c13d
commit 4e2a950ac9

@ -47,7 +47,7 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlLoaderMessage;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.crawler.http.CrawlWorker; import de.anomic.plasma.crawler.plasmaCrawlWorker;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyCore;
@ -74,8 +74,8 @@ public class IndexCreateLoaderQueue_p {
yacySeed initiator; yacySeed initiator;
int i, count = 0; int i, count = 0;
for (i = 0; i < threadCount; i++) { for (i = 0; i < threadCount; i++) {
CrawlWorker theWorker = (CrawlWorker)threadList[i]; plasmaCrawlWorker theWorker = (plasmaCrawlWorker)threadList[i];
plasmaCrawlLoaderMessage theMsg = theWorker.theMsg; plasmaCrawlLoaderMessage theMsg = theWorker.getMessage();
if (theMsg == null) continue; if (theMsg == null) continue;
initiator = yacyCore.seedDB.getConnected(theMsg.initiator); initiator = yacyCore.seedDB.getConnected(theMsg.initiator);

@ -197,7 +197,7 @@ public class PerformanceQueues_p {
GenericKeyedObjectPool.Config crawlerPoolConfig = switchboard.cacheLoader.getPoolConfig(); GenericKeyedObjectPool.Config crawlerPoolConfig = switchboard.cacheLoader.getPoolConfig();
int maxActive = Integer.parseInt(post.get("Crawler Pool_maxActive","8")); int maxActive = Integer.parseInt(post.get("Crawler Pool_maxActive","8"));
int maxIdle = Integer.parseInt(post.get("Crawler Pool_maxIdle","4")); int maxIdle = Integer.parseInt(post.get("Crawler Pool_maxIdle","4"));
int minIdle = Integer.parseInt(post.get("Crawler Pool_minIdle","0")); int minIdle = 0; // Integer.parseInt(post.get("Crawler Pool_minIdle","0"));
//crawlerPoolConfig.minIdle = (minIdle > maxIdle) ? maxIdle/2 : minIdle; //crawlerPoolConfig.minIdle = (minIdle > maxIdle) ? maxIdle/2 : minIdle;
crawlerPoolConfig.maxIdle = (maxIdle > maxActive) ? maxActive/2 : maxIdle; crawlerPoolConfig.maxIdle = (maxIdle > maxActive) ? maxActive/2 : maxIdle;

@ -27,6 +27,9 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
public boolean destroyed = false; public boolean destroyed = false;
protected boolean running = false; protected boolean running = false;
protected boolean stopped = false; protected boolean stopped = false;
/**
* Specifies that the execution of the current crawl job has finished
*/
protected boolean done = false; protected boolean done = false;
/* ============================================================ /* ============================================================
@ -86,6 +89,14 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.log = theLog; this.log = theLog;
} }
/**
 * Renames this worker thread to {@link plasmaCrawlWorker#threadBaseName}
 * followed by the supplied trailer.
 *
 * @param trailer suffix appended to the common thread base name
 */
public void setNameTrailer(String trailer) {
    final String workerName = plasmaCrawlWorker.threadBaseName + trailer;
    setName(workerName);
}
/**
 * Gives access to the message stored in this worker's {@code theMsg} field.
 *
 * @return the value of {@code theMsg}; callers should be prepared for
 *         {@code null}
 */
public plasmaCrawlLoaderMessage getMessage() {
    final plasmaCrawlLoaderMessage currentMessage = this.theMsg;
    return currentMessage;
}
public abstract void close(); public abstract void close();
public void run() { public void run() {
@ -173,6 +184,10 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
public void setStopped(boolean isStopped) { public void setStopped(boolean isStopped) {
this.stopped = isStopped; this.stopped = isStopped;
} }
/**
 * Updates the {@code destroyed} flag of this worker.
 *
 * @param isDestroyed the new value of the flag
 */
public void setDestroyed(boolean isDestroyed) {
    destroyed = isDestroyed;
}
public boolean isRunning() { public boolean isRunning() {
return this.running; return this.running;

@ -10,6 +10,13 @@ public interface plasmaCrawlWorker {
public static final String threadBaseName = "CrawlerWorker"; public static final String threadBaseName = "CrawlerWorker";
public void setNameTrailer(String trailer);
public void setStopped(boolean isStopped);
public void setDestroyed(boolean isDestroyed);
public plasmaCrawlLoaderMessage getMessage();
public void reset(); public void reset();
public void execute(); public void execute();
public void execute(plasmaCrawlLoaderMessage theNewMsg); public void execute(plasmaCrawlLoaderMessage theNewMsg);

@ -6,7 +6,6 @@ import org.apache.commons.pool.KeyedPoolableObjectFactory;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.crawler.http.CrawlWorker;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory { public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
@ -62,7 +61,7 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
} ); } );
// instantiating class // instantiating class
CrawlWorker theCrawlWorker = (CrawlWorker) classConstructor.newInstance(new Object[] { plasmaCrawlWorker theCrawlWorker = (plasmaCrawlWorker) classConstructor.newInstance(new Object[] {
this.theThreadGroup, this.theThreadGroup,
this.thePool, this.thePool,
this.sb, this.sb,
@ -86,13 +85,13 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
*/ */
public void destroyObject(Object key, Object obj) { public void destroyObject(Object key, Object obj) {
if (obj == null) return; if (obj == null) return;
if (obj instanceof CrawlWorker) { if (obj instanceof plasmaCrawlWorker) {
CrawlWorker theWorker = (CrawlWorker) obj; plasmaCrawlWorker theWorker = (plasmaCrawlWorker) obj;
synchronized(theWorker) { synchronized(theWorker) {
theWorker.destroyed = true; theWorker.setDestroyed(true);
theWorker.setName(plasmaCrawlWorker.threadBaseName + "_destroyed"); theWorker.setNameTrailer("_destroyed");
theWorker.setStopped(true); theWorker.setStopped(true);
theWorker.interrupt(); ((Thread)theWorker).interrupt();
} }
} }
} }

@ -1,8 +1,6 @@
package de.anomic.plasma.crawler; package de.anomic.plasma.crawler;
import org.apache.commons.pool.impl.GenericKeyedObjectPool; import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import de.anomic.plasma.crawler.http.CrawlWorker;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
public final class plasmaCrawlerPool extends GenericKeyedObjectPool { public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
@ -21,12 +19,12 @@ public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
public void returnObject(Object key,Object obj) { public void returnObject(Object key,Object obj) {
if (obj == null) return; if (obj == null) return;
if (obj instanceof CrawlWorker) { if (obj instanceof plasmaCrawlWorker) {
try { try {
((CrawlWorker)obj).setName(plasmaCrawlWorker.threadBaseName + "_inPool"); ((plasmaCrawlWorker)obj).setNameTrailer("_inPool");
super.returnObject(key,obj); super.returnObject(key,obj);
} catch (Exception e) { } catch (Exception e) {
((CrawlWorker)obj).setStopped(true); ((plasmaCrawlWorker)obj).setStopped(true);
serverLog.logSevere("CRAWLER-POOL","Unable to return crawler thread to pool.",e); serverLog.logSevere("CRAWLER-POOL","Unable to return crawler thread to pool.",e);
} }
} else { } else {
@ -38,10 +36,10 @@ public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
public void invalidateObject(Object key,Object obj) { public void invalidateObject(Object key,Object obj) {
if (obj == null) return; if (obj == null) return;
if (this.isClosed) return; if (this.isClosed) return;
if (obj instanceof CrawlWorker) { if (obj instanceof plasmaCrawlWorker) {
try { try {
((CrawlWorker)obj).setName(plasmaCrawlWorker.threadBaseName + "_invalidated"); ((plasmaCrawlWorker)obj).setNameTrailer("_invalidated");
((CrawlWorker)obj).setStopped(true); ((plasmaCrawlWorker)obj).setStopped(true);
super.invalidateObject(key,obj); super.invalidateObject(key,obj);
} catch (Exception e) { } catch (Exception e) {
serverLog.logSevere("CRAWLER-POOL","Unable to invalidate crawling thread.",e); serverLog.logSevere("CRAWLER-POOL","Unable to invalidate crawling thread.",e);
@ -64,7 +62,7 @@ public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
// signaling shutdown to all still running or pooled threads ... // signaling shutdown to all still running or pooled threads ...
serverLog.logInfo("CRAWLER","Signaling shutdown to " + threadCount + " remaining crawler threads ..."); serverLog.logInfo("CRAWLER","Signaling shutdown to " + threadCount + " remaining crawler threads ...");
for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) { for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
((CrawlWorker)threadList[currentThreadIdx]).setStopped(true); ((plasmaCrawlWorker)threadList[currentThreadIdx]).setStopped(true);
} }
// giving the crawlers some time to finish shutdown // giving the crawlers some time to finish shutdown
@ -80,7 +78,7 @@ public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
Thread currentThread = threadList[currentThreadIdx]; Thread currentThread = threadList[currentThreadIdx];
if (currentThread.isAlive()) { if (currentThread.isAlive()) {
serverLog.logInfo("CRAWLER","Trying to shutdown crawler thread '" + currentThread.getName() + "' [" + currentThreadIdx + "]."); serverLog.logInfo("CRAWLER","Trying to shutdown crawler thread '" + currentThread.getName() + "' [" + currentThreadIdx + "].");
((CrawlWorker)currentThread).close(); ((plasmaCrawlWorker)currentThread).close();
} }
} }

@ -48,10 +48,10 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool; import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.crawler.plasmaCrawlWorker;
import de.anomic.plasma.crawler.plasmaCrawlerFactory; import de.anomic.plasma.crawler.plasmaCrawlerFactory;
import de.anomic.plasma.crawler.plasmaCrawlerMsgQueue; import de.anomic.plasma.crawler.plasmaCrawlerMsgQueue;
import de.anomic.plasma.crawler.plasmaCrawlerPool; import de.anomic.plasma.crawler.plasmaCrawlerPool;
import de.anomic.plasma.crawler.http.CrawlWorker;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
public final class plasmaCrawlLoader extends Thread { public final class plasmaCrawlLoader extends Thread {
@ -147,8 +147,12 @@ public final class plasmaCrawlLoader extends Thread {
String protocol = theMsg.url.getProtocol(); String protocol = theMsg.url.getProtocol();
// getting a new crawler from the crawler pool // getting a new crawler from the crawler pool
CrawlWorker theWorker = (CrawlWorker) this.crawlwerPool.borrowObject(protocol); plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
if (theWorker != null) theWorker.execute(theMsg); if (theWorker == null) {
this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + theMsg.url);
} else {
theWorker.execute(theMsg);
}
} }
public void run() { public void run() {

Loading…
Cancel
Save