You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
303 lines
10 KiB
303 lines
10 KiB
// AbstractCrawlWorker.java
|
|
// -------------------------------------
|
|
// part of YACY
|
|
// (C) by Michael Peter Christen; mc@anomic.de
|
|
// first published on http://www.anomic.de
|
|
// Frankfurt, Germany, 2006
|
|
//
|
|
// This file ist contributed by Martin Thelian
|
|
//
|
|
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
|
|
// $LastChangedRevision: 1715 $
|
|
// $LastChangedBy: theli $
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
//
|
|
// Using this software in any meaning (reading, learning, copying, compiling,
|
|
// running) means that you agree that the Author(s) is (are) not responsible
|
|
// for cost, loss of data or any harm that may be caused directly or indirectly
|
|
// by usage of this softare or this documentation. The usage of this software
|
|
// is on your own risk. The installation and usage (starting/running) of this
|
|
// software may allow other people or application to access your computer and
|
|
// any attached devices and is highly dependent on the configuration of the
|
|
// software which must be done by the user of the software; the author(s) is
|
|
// (are) also not responsible for proper configuration and usage of the
|
|
// software, even if provoked by documentation provided together with
|
|
// the software.
|
|
//
|
|
// Any changes to this file according to the GPL as documented in the file
|
|
// gpl.txt aside this file in the shipment you received can be done to the
|
|
// lines that follows this copyright notice here, but changes must not be
|
|
// done inside the copyright notive above. A re-distribution must contain
|
|
// the intact and unchanged copyright notice.
|
|
// Contributions and changes to the program code must be marked as such.
|
|
|
|
|
|
package de.anomic.plasma.crawler;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
|
|
import de.anomic.index.indexURL;
|
|
import de.anomic.net.URL;
|
|
import de.anomic.plasma.plasmaCrawlEURL;
|
|
import de.anomic.plasma.plasmaCrawlLoaderMessage;
|
|
import de.anomic.plasma.plasmaCrawlProfile;
|
|
import de.anomic.plasma.plasmaHTCache;
|
|
import de.anomic.plasma.plasmaSwitchboard;
|
|
import de.anomic.server.logging.serverLog;
|
|
import de.anomic.tools.bitfield;
|
|
import de.anomic.yacy.yacyCore;
|
|
|
|
public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlWorker {
|
|
|
|
/**
|
|
* The protocol that is supported by this crawler
|
|
* e.g. <code>http</code>, <code>ftp</code>, etc.
|
|
*/
|
|
protected String protocol;
|
|
|
|
/* ============================================================
|
|
* Variables for thread pool management
|
|
* ============================================================ */
|
|
public boolean destroyed = false;
|
|
protected boolean running = false;
|
|
protected boolean stopped = false;
|
|
/**
|
|
* Specifies that the execution of the current crawl job has finished
|
|
*/
|
|
protected boolean done = false;
|
|
|
|
/* ============================================================
|
|
* Crawl job specific variables
|
|
* ============================================================ */
|
|
public plasmaCrawlLoaderMessage theMsg;
|
|
protected URL url;
|
|
protected String name;
|
|
protected String refererURLString;
|
|
protected String initiator;
|
|
protected int depth;
|
|
protected long startdate;
|
|
protected plasmaCrawlProfile.entry profile;
|
|
protected boolean acceptAllContent;
|
|
|
|
protected String errorMessage;
|
|
|
|
/**
|
|
* The crawler thread pool
|
|
*/
|
|
protected final plasmaCrawlerPool myPool;
|
|
|
|
/**
|
|
* reference to the plasma switchboard
|
|
*/
|
|
protected final plasmaSwitchboard sb;
|
|
|
|
/**
|
|
* reference to the cache manager
|
|
*/
|
|
protected final plasmaHTCache cacheManager;
|
|
|
|
/**
|
|
* Logging class
|
|
*/
|
|
protected final serverLog log;
|
|
|
|
|
|
/**
|
|
* Constructor of this class
|
|
* @param theTG the crawl worker thread group
|
|
* @param thePool the crawl worker thread pool
|
|
* @param theSb plasma switchboard
|
|
* @param theCacheManager cache manager
|
|
* @param theLog server log
|
|
*/
|
|
public AbstractCrawlWorker(
|
|
ThreadGroup theTG,
|
|
plasmaCrawlerPool thePool,
|
|
plasmaSwitchboard theSb,
|
|
plasmaHTCache theCacheManager,
|
|
serverLog theLog
|
|
) {
|
|
super(theTG,plasmaCrawlWorker.threadBaseName + "_created");
|
|
|
|
this.myPool = thePool;
|
|
this.sb = theSb;
|
|
this.cacheManager = theCacheManager;
|
|
this.log = theLog;
|
|
}
|
|
|
|
public void setNameTrailer(String trailer) {
|
|
this.setName(plasmaCrawlWorker.threadBaseName + trailer);
|
|
}
|
|
|
|
public plasmaCrawlLoaderMessage getMessage() {
|
|
return this.theMsg;
|
|
}
|
|
|
|
public abstract void close();
|
|
|
|
public long getDuration() {
|
|
final long startDate = this.startdate;
|
|
return (startDate != 0) ? System.currentTimeMillis() - startDate : 0;
|
|
}
|
|
|
|
public void run() {
|
|
this.running = true;
|
|
|
|
try {
|
|
// The thread keeps running.
|
|
while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) {
|
|
if (this.done) {
|
|
synchronized (this) {
|
|
// return thread back into pool
|
|
this.myPool.returnObject(this.protocol,this);
|
|
|
|
// We are waiting for a new task now.
|
|
if (!this.stopped && !this.destroyed && !this.isInterrupted()) {
|
|
this.wait();
|
|
}
|
|
}
|
|
} else {
|
|
try {
|
|
// executing the new task
|
|
execute();
|
|
} finally {
|
|
reset();
|
|
}
|
|
}
|
|
}
|
|
} catch (InterruptedException ex) {
|
|
serverLog.logFiner("CRAWLER-POOL","Interruption of thread '" + this.getName() + "' detected.");
|
|
} finally {
|
|
if (this.myPool != null && !this.destroyed)
|
|
this.myPool.invalidateObject(this.protocol,this);
|
|
}
|
|
}
|
|
|
|
public void execute() {
|
|
|
|
plasmaHTCache.Entry loadedResource = null;
|
|
try {
|
|
// setting threadname
|
|
this.setName(plasmaCrawlWorker.threadBaseName + "_" + this.url);
|
|
|
|
// load some configuration variables
|
|
init();
|
|
|
|
// loading resource
|
|
loadedResource = load();
|
|
} catch (IOException e) {
|
|
//throw e;
|
|
} finally {
|
|
// setting the error message (if available)
|
|
if (this.errorMessage != null) {
|
|
this.theMsg.setError(this.errorMessage);
|
|
}
|
|
|
|
// store a reference to the result in the message object
|
|
// this is e.g. needed by the snippet fetcher
|
|
//
|
|
// Note: this is always called, even on empty results.
|
|
// Otherwise the caller will block forever
|
|
this.theMsg.setResult(loadedResource);
|
|
|
|
// signal that this worker thread has finished the job
|
|
this.done = true;
|
|
}
|
|
}
|
|
|
|
public void execute(plasmaCrawlLoaderMessage theNewMsg) {
|
|
synchronized (this) {
|
|
|
|
this.theMsg = theNewMsg;
|
|
|
|
this.url = theNewMsg.url;
|
|
this.name = theNewMsg.name;
|
|
this.refererURLString = theNewMsg.referer;
|
|
this.initiator = theNewMsg.initiator;
|
|
this.depth = theNewMsg.depth;
|
|
this.profile = theNewMsg.profile;
|
|
this.acceptAllContent = theNewMsg.acceptAllContent;
|
|
|
|
this.startdate = System.currentTimeMillis();
|
|
|
|
this.done = false;
|
|
|
|
if (!this.running) {
|
|
// if the thread is not running until yet, we need to start it now
|
|
this.start();
|
|
} else {
|
|
// inform the thread about the new crawl job
|
|
this.notifyAll();
|
|
}
|
|
}
|
|
}
|
|
|
|
public void setStopped(boolean isStopped) {
|
|
this.stopped = isStopped;
|
|
}
|
|
|
|
public void setDestroyed(boolean isDestroyed) {
|
|
this.destroyed = isDestroyed;
|
|
}
|
|
|
|
public boolean isRunning() {
|
|
return this.running;
|
|
}
|
|
|
|
public void reset() {
|
|
this.theMsg = null;
|
|
this.url = null;
|
|
this.name = null;
|
|
this.refererURLString = null;
|
|
this.initiator = null;
|
|
this.depth = 0;
|
|
this.startdate = 0;
|
|
this.profile = null;
|
|
this.acceptAllContent = false;
|
|
this.errorMessage = null;
|
|
}
|
|
|
|
protected void addURLtoErrorDB(String failreason) {
|
|
// remember error message
|
|
this.errorMessage = failreason;
|
|
|
|
// convert the referrer URL into a hash value
|
|
String referrerHash = (this.refererURLString==null)?null:indexURL.urlHash(this.refererURLString);
|
|
|
|
// create a new errorURL DB entry
|
|
plasmaCrawlEURL.Entry ee = this.sb.urlPool.errorURL.newEntry(
|
|
this.url,
|
|
referrerHash,
|
|
this.initiator,
|
|
yacyCore.seedDB.mySeed.hash,
|
|
this.name,
|
|
(failreason==null)?"Unknown reason":failreason,
|
|
new bitfield(indexURL.urlFlagLength)
|
|
);
|
|
|
|
// store the entry
|
|
ee.store();
|
|
|
|
// push it onto the stack
|
|
this.sb.urlPool.errorURL.stackPushEntry(ee);
|
|
|
|
// delete the cache file
|
|
File cacheFile = this.cacheManager.getCachePath(this.url);
|
|
if (cacheFile.exists()) cacheFile.delete();
|
|
}
|
|
}
|