- adding interface class (plasma/crawler/plasmaCrawlWorker.java) for protocol specific crawl-worker threads - moving reusable code into abstract crawl-worker class AbstractCrawlWorker.java - the load method of the worker threads should not be called directly anymore (e.g. by the snippet fetcher) to crawl a page and wait for the result use function plasmaCrawlLoader.loadSync([...]) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2474 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
eb9b138986
commit
09b106eb04
@ -0,0 +1,214 @@
|
||||
package de.anomic.plasma.crawler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import de.anomic.index.indexURL;
|
||||
import de.anomic.net.URL;
|
||||
import de.anomic.plasma.plasmaCrawlEURL;
|
||||
import de.anomic.plasma.plasmaCrawlLoaderMessage;
|
||||
import de.anomic.plasma.plasmaCrawlProfile;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
import de.anomic.tools.bitfield;
|
||||
import de.anomic.yacy.yacyCore;
|
||||
|
||||
public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlWorker {
|
||||
|
||||
/**
|
||||
* The protocol that is supported by this crawler
|
||||
* e.g. <code>http</code>, <code>ftp</code>, etc.
|
||||
*/
|
||||
protected String protocol;
|
||||
|
||||
/* ============================================================
|
||||
* Variables for thread pool management
|
||||
* ============================================================ */
|
||||
public boolean destroyed = false;
|
||||
protected boolean running = false;
|
||||
protected boolean stopped = false;
|
||||
protected boolean done = false;
|
||||
|
||||
/* ============================================================
|
||||
* Crawl job specific variables
|
||||
* ============================================================ */
|
||||
public plasmaCrawlLoaderMessage theMsg;
|
||||
protected URL url;
|
||||
protected String name;
|
||||
protected String refererURLString;
|
||||
protected String initiator;
|
||||
protected int depth;
|
||||
protected long startdate;
|
||||
protected plasmaCrawlProfile.entry profile;
|
||||
protected boolean acceptAllContent;
|
||||
|
||||
/**
|
||||
* The crawler thread pool
|
||||
*/
|
||||
protected final plasmaCrawlerPool myPool;
|
||||
|
||||
/**
|
||||
* reference to the plasma switchboard
|
||||
*/
|
||||
protected final plasmaSwitchboard sb;
|
||||
|
||||
/**
|
||||
* reference to the cache manager
|
||||
*/
|
||||
protected final plasmaHTCache cacheManager;
|
||||
|
||||
/**
|
||||
* Logging class
|
||||
*/
|
||||
protected final serverLog log;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor of this class
|
||||
* @param theTG the crawl worker thread group
|
||||
* @param thePool the crawl worker thread pool
|
||||
* @param theSb plasma switchboard
|
||||
* @param theCacheManager cache manager
|
||||
* @param theLog server log
|
||||
*/
|
||||
public AbstractCrawlWorker(
|
||||
ThreadGroup theTG,
|
||||
plasmaCrawlerPool thePool,
|
||||
plasmaSwitchboard theSb,
|
||||
plasmaHTCache theCacheManager,
|
||||
serverLog theLog
|
||||
) {
|
||||
super(theTG,plasmaCrawlWorker.threadBaseName + "_created");
|
||||
|
||||
this.myPool = thePool;
|
||||
this.sb = theSb;
|
||||
this.cacheManager = theCacheManager;
|
||||
this.log = theLog;
|
||||
}
|
||||
|
||||
public abstract void close();
|
||||
|
||||
public void run() {
|
||||
this.running = true;
|
||||
|
||||
try {
|
||||
// The thread keeps running.
|
||||
while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) {
|
||||
if (this.done) {
|
||||
synchronized (this) {
|
||||
// return thread back into pool
|
||||
this.myPool.returnObject(this.protocol,this);
|
||||
|
||||
// We are waiting for a new task now.
|
||||
if (!this.stopped && !this.destroyed && !this.isInterrupted()) {
|
||||
this.wait();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
// executing the new task
|
||||
execute();
|
||||
} finally {
|
||||
reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException ex) {
|
||||
serverLog.logFiner("CRAWLER-POOL","Interruption of thread '" + this.getName() + "' detected.");
|
||||
} finally {
|
||||
if (this.myPool != null && !this.destroyed)
|
||||
this.myPool.invalidateObject(this.protocol,this);
|
||||
}
|
||||
}
|
||||
|
||||
public void execute() {
|
||||
try {
|
||||
// setting threadname
|
||||
this.setName(plasmaCrawlWorker.threadBaseName + "_" + this.url);
|
||||
|
||||
// load some configuration variables
|
||||
init();
|
||||
|
||||
// loading resource
|
||||
plasmaHTCache.Entry resource = load();
|
||||
|
||||
// store a reference to the result in the message object
|
||||
// this is e.g. needed by the snippet fetcher
|
||||
this.theMsg.setResult(resource);
|
||||
|
||||
} catch (IOException e) {
|
||||
//throw e;
|
||||
} finally {
|
||||
this.done = true;
|
||||
}
|
||||
}
|
||||
|
||||
public void execute(plasmaCrawlLoaderMessage theNewMsg) {
|
||||
synchronized (this) {
|
||||
|
||||
this.theMsg = theNewMsg;
|
||||
|
||||
this.url = theNewMsg.url;
|
||||
this.name = theNewMsg.name;
|
||||
this.refererURLString = theNewMsg.referer;
|
||||
this.initiator = theNewMsg.initiator;
|
||||
this.depth = theNewMsg.depth;
|
||||
this.profile = theNewMsg.profile;
|
||||
this.acceptAllContent = theNewMsg.acceptAllContent;
|
||||
|
||||
this.startdate = System.currentTimeMillis();
|
||||
|
||||
this.done = false;
|
||||
|
||||
if (!this.running) {
|
||||
// if the thread is not running until yet, we need to start it now
|
||||
this.start();
|
||||
} else {
|
||||
// inform the thread about the new crawl job
|
||||
this.notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void setStopped(boolean isStopped) {
|
||||
this.stopped = isStopped;
|
||||
}
|
||||
|
||||
public boolean isRunning() {
|
||||
return this.running;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
this.theMsg = null;
|
||||
this.url = null;
|
||||
this.name = null;
|
||||
this.refererURLString = null;
|
||||
this.initiator = null;
|
||||
this.depth = 0;
|
||||
this.startdate = 0;
|
||||
this.profile = null;
|
||||
this.acceptAllContent = false;
|
||||
}
|
||||
|
||||
protected void addURLtoErrorDB(String failreason) {
|
||||
// convert the referrer URL into a hash value
|
||||
String referrerHash = (this.refererURLString==null)?null:indexURL.urlHash(this.refererURLString);
|
||||
|
||||
// create a new errorURL DB entry
|
||||
plasmaCrawlEURL.Entry ee = this.sb.urlPool.errorURL.newEntry(
|
||||
this.url,
|
||||
referrerHash,
|
||||
this.initiator,
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
this.name,
|
||||
(failreason==null)?"Unknown reason":failreason,
|
||||
new bitfield(indexURL.urlFlagLength)
|
||||
);
|
||||
|
||||
// store the entry
|
||||
ee.store();
|
||||
|
||||
// push it onto the stack
|
||||
this.sb.urlPool.errorURL.stackPushEntry(ee);
|
||||
}
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
package de.anomic.plasma.crawler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import de.anomic.plasma.plasmaCrawlLoaderMessage;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
|
||||
|
||||
public interface plasmaCrawlWorker {
|
||||
|
||||
public static final String threadBaseName = "CrawlerWorker";
|
||||
|
||||
public void reset();
|
||||
public void execute();
|
||||
public void execute(plasmaCrawlLoaderMessage theNewMsg);
|
||||
public void init();
|
||||
|
||||
public void close();
|
||||
public plasmaHTCache.Entry load() throws IOException;
|
||||
}
|
Loading…
Reference in new issue