- adding interface class (plasma/crawler/plasmaCrawlWorker.java) for protocol-specific crawl-worker threads
- moving reusable code into the abstract crawl-worker class AbstractCrawlWorker.java
- the load method of the worker threads should not be called directly anymore (e.g. by the snippet fetcher); to crawl a page and wait for the result, use the function plasmaCrawlLoader.loadSync([...])

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2474 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
parent
eb9b138986
commit
09b106eb04
@ -0,0 +1,214 @@
|
|||||||
|
package de.anomic.plasma.crawler;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import de.anomic.index.indexURL;
|
||||||
|
import de.anomic.net.URL;
|
||||||
|
import de.anomic.plasma.plasmaCrawlEURL;
|
||||||
|
import de.anomic.plasma.plasmaCrawlLoaderMessage;
|
||||||
|
import de.anomic.plasma.plasmaCrawlProfile;
|
||||||
|
import de.anomic.plasma.plasmaHTCache;
|
||||||
|
import de.anomic.plasma.plasmaSwitchboard;
|
||||||
|
import de.anomic.server.logging.serverLog;
|
||||||
|
import de.anomic.tools.bitfield;
|
||||||
|
import de.anomic.yacy.yacyCore;
|
||||||
|
|
||||||
|
public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlWorker {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The protocol that is supported by this crawler
|
||||||
|
* e.g. <code>http</code>, <code>ftp</code>, etc.
|
||||||
|
*/
|
||||||
|
protected String protocol;
|
||||||
|
|
||||||
|
/* ============================================================
|
||||||
|
* Variables for thread pool management
|
||||||
|
* ============================================================ */
|
||||||
|
public boolean destroyed = false;
|
||||||
|
protected boolean running = false;
|
||||||
|
protected boolean stopped = false;
|
||||||
|
protected boolean done = false;
|
||||||
|
|
||||||
|
/* ============================================================
|
||||||
|
* Crawl job specific variables
|
||||||
|
* ============================================================ */
|
||||||
|
public plasmaCrawlLoaderMessage theMsg;
|
||||||
|
protected URL url;
|
||||||
|
protected String name;
|
||||||
|
protected String refererURLString;
|
||||||
|
protected String initiator;
|
||||||
|
protected int depth;
|
||||||
|
protected long startdate;
|
||||||
|
protected plasmaCrawlProfile.entry profile;
|
||||||
|
protected boolean acceptAllContent;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The crawler thread pool
|
||||||
|
*/
|
||||||
|
protected final plasmaCrawlerPool myPool;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* reference to the plasma switchboard
|
||||||
|
*/
|
||||||
|
protected final plasmaSwitchboard sb;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* reference to the cache manager
|
||||||
|
*/
|
||||||
|
protected final plasmaHTCache cacheManager;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Logging class
|
||||||
|
*/
|
||||||
|
protected final serverLog log;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor of this class
|
||||||
|
* @param theTG the crawl worker thread group
|
||||||
|
* @param thePool the crawl worker thread pool
|
||||||
|
* @param theSb plasma switchboard
|
||||||
|
* @param theCacheManager cache manager
|
||||||
|
* @param theLog server log
|
||||||
|
*/
|
||||||
|
public AbstractCrawlWorker(
|
||||||
|
ThreadGroup theTG,
|
||||||
|
plasmaCrawlerPool thePool,
|
||||||
|
plasmaSwitchboard theSb,
|
||||||
|
plasmaHTCache theCacheManager,
|
||||||
|
serverLog theLog
|
||||||
|
) {
|
||||||
|
super(theTG,plasmaCrawlWorker.threadBaseName + "_created");
|
||||||
|
|
||||||
|
this.myPool = thePool;
|
||||||
|
this.sb = theSb;
|
||||||
|
this.cacheManager = theCacheManager;
|
||||||
|
this.log = theLog;
|
||||||
|
}
|
||||||
|
|
||||||
|
public abstract void close();
|
||||||
|
|
||||||
|
public void run() {
|
||||||
|
this.running = true;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// The thread keeps running.
|
||||||
|
while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) {
|
||||||
|
if (this.done) {
|
||||||
|
synchronized (this) {
|
||||||
|
// return thread back into pool
|
||||||
|
this.myPool.returnObject(this.protocol,this);
|
||||||
|
|
||||||
|
// We are waiting for a new task now.
|
||||||
|
if (!this.stopped && !this.destroyed && !this.isInterrupted()) {
|
||||||
|
this.wait();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
// executing the new task
|
||||||
|
execute();
|
||||||
|
} finally {
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (InterruptedException ex) {
|
||||||
|
serverLog.logFiner("CRAWLER-POOL","Interruption of thread '" + this.getName() + "' detected.");
|
||||||
|
} finally {
|
||||||
|
if (this.myPool != null && !this.destroyed)
|
||||||
|
this.myPool.invalidateObject(this.protocol,this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void execute() {
|
||||||
|
try {
|
||||||
|
// setting threadname
|
||||||
|
this.setName(plasmaCrawlWorker.threadBaseName + "_" + this.url);
|
||||||
|
|
||||||
|
// load some configuration variables
|
||||||
|
init();
|
||||||
|
|
||||||
|
// loading resource
|
||||||
|
plasmaHTCache.Entry resource = load();
|
||||||
|
|
||||||
|
// store a reference to the result in the message object
|
||||||
|
// this is e.g. needed by the snippet fetcher
|
||||||
|
this.theMsg.setResult(resource);
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
//throw e;
|
||||||
|
} finally {
|
||||||
|
this.done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void execute(plasmaCrawlLoaderMessage theNewMsg) {
|
||||||
|
synchronized (this) {
|
||||||
|
|
||||||
|
this.theMsg = theNewMsg;
|
||||||
|
|
||||||
|
this.url = theNewMsg.url;
|
||||||
|
this.name = theNewMsg.name;
|
||||||
|
this.refererURLString = theNewMsg.referer;
|
||||||
|
this.initiator = theNewMsg.initiator;
|
||||||
|
this.depth = theNewMsg.depth;
|
||||||
|
this.profile = theNewMsg.profile;
|
||||||
|
this.acceptAllContent = theNewMsg.acceptAllContent;
|
||||||
|
|
||||||
|
this.startdate = System.currentTimeMillis();
|
||||||
|
|
||||||
|
this.done = false;
|
||||||
|
|
||||||
|
if (!this.running) {
|
||||||
|
// if the thread is not running until yet, we need to start it now
|
||||||
|
this.start();
|
||||||
|
} else {
|
||||||
|
// inform the thread about the new crawl job
|
||||||
|
this.notifyAll();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setStopped(boolean isStopped) {
|
||||||
|
this.stopped = isStopped;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isRunning() {
|
||||||
|
return this.running;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() {
|
||||||
|
this.theMsg = null;
|
||||||
|
this.url = null;
|
||||||
|
this.name = null;
|
||||||
|
this.refererURLString = null;
|
||||||
|
this.initiator = null;
|
||||||
|
this.depth = 0;
|
||||||
|
this.startdate = 0;
|
||||||
|
this.profile = null;
|
||||||
|
this.acceptAllContent = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addURLtoErrorDB(String failreason) {
|
||||||
|
// convert the referrer URL into a hash value
|
||||||
|
String referrerHash = (this.refererURLString==null)?null:indexURL.urlHash(this.refererURLString);
|
||||||
|
|
||||||
|
// create a new errorURL DB entry
|
||||||
|
plasmaCrawlEURL.Entry ee = this.sb.urlPool.errorURL.newEntry(
|
||||||
|
this.url,
|
||||||
|
referrerHash,
|
||||||
|
this.initiator,
|
||||||
|
yacyCore.seedDB.mySeed.hash,
|
||||||
|
this.name,
|
||||||
|
(failreason==null)?"Unknown reason":failreason,
|
||||||
|
new bitfield(indexURL.urlFlagLength)
|
||||||
|
);
|
||||||
|
|
||||||
|
// store the entry
|
||||||
|
ee.store();
|
||||||
|
|
||||||
|
// push it onto the stack
|
||||||
|
this.sb.urlPool.errorURL.stackPushEntry(ee);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,20 @@
|
|||||||
|
package de.anomic.plasma.crawler;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import de.anomic.plasma.plasmaCrawlLoaderMessage;
|
||||||
|
import de.anomic.plasma.plasmaHTCache;
|
||||||
|
|
||||||
|
|
||||||
|
public interface plasmaCrawlWorker {
|
||||||
|
|
||||||
|
public static final String threadBaseName = "CrawlerWorker";
|
||||||
|
|
||||||
|
public void reset();
|
||||||
|
public void execute();
|
||||||
|
public void execute(plasmaCrawlLoaderMessage theNewMsg);
|
||||||
|
public void init();
|
||||||
|
|
||||||
|
public void close();
|
||||||
|
public plasmaHTCache.Entry load() throws IOException;
|
||||||
|
}
|
Loading…
Reference in new issue