From 1395aae7429de7a4e334dea735520057efae4bce Mon Sep 17 00:00:00 2001 From: theli Date: Mon, 4 Sep 2006 06:09:20 +0000 Subject: [PATCH] *) starting restructuring which is needed to add crawlers for additional protocols git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2472 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreateLoaderQueue_p.java | 2 +- htroot/xml/queues_p.java | 2 +- .../{ => crawler}/plasmaCrawlWorker.java | 53 ++-- .../plasma/crawler/plasmaCrawlerFactory.java | 93 ++++++ .../plasma/crawler/plasmaCrawlerMsgQueue.java | 127 ++++++++ .../plasma/crawler/plasmaCrawlerPool.java | 106 +++++++ .../de/anomic/plasma/plasmaCrawlLoader.java | 288 +----------------- .../de/anomic/plasma/plasmaParserConfig.java | 50 ++- .../de/anomic/plasma/plasmaSnippetCache.java | 2 + 9 files changed, 412 insertions(+), 311 deletions(-) rename source/de/anomic/plasma/{ => crawler}/plasmaCrawlWorker.java (94%) create mode 100644 source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java create mode 100644 source/de/anomic/plasma/crawler/plasmaCrawlerMsgQueue.java create mode 100644 source/de/anomic/plasma/crawler/plasmaCrawlerPool.java diff --git a/htroot/IndexCreateLoaderQueue_p.java b/htroot/IndexCreateLoaderQueue_p.java index 1e465937e..b2bebb11f 100644 --- a/htroot/IndexCreateLoaderQueue_p.java +++ b/htroot/IndexCreateLoaderQueue_p.java @@ -46,8 +46,8 @@ import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlLoaderMessage; -import de.anomic.plasma.plasmaCrawlWorker; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.crawler.plasmaCrawlWorker; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; diff --git a/htroot/xml/queues_p.java b/htroot/xml/queues_p.java index 6099ad6be..91e26d088 100644 --- a/htroot/xml/queues_p.java +++ b/htroot/xml/queues_p.java @@ -57,7 +57,7 @@ import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; -import de.anomic.plasma.plasmaCrawlWorker; +import de.anomic.plasma.crawler.plasmaCrawlWorker; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboardQueue; import de.anomic.server.serverObjects; diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/crawler/plasmaCrawlWorker.java similarity index 94% rename from source/de/anomic/plasma/plasmaCrawlWorker.java rename to source/de/anomic/plasma/crawler/plasmaCrawlWorker.java index 2e05d3118..4c171afdb 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlWorker.java @@ -5,9 +5,9 @@ //first published on http://www.anomic.de //Frankfurt, Germany, 2004 // -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ +// $LastChangedDate: 2006-08-12 16:28:14 +0200 (Sa, 12 Aug 2006) $ +// $LastChangedRevision: 2397 $ +// $LastChangedBy: theli $ // //This program is free software; you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by @@ -42,24 +42,31 @@ //the intact and unchanged copyright notice. //Contributions and changes to the program code must be marked as such. -package de.anomic.plasma; +package de.anomic.plasma.crawler; -import java.io.File; +import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.NoRouteToHostException; import java.net.SocketException; -import de.anomic.net.URL; -import de.anomic.plasma.urlPattern.plasmaURLPattern; - import java.net.UnknownHostException; import java.util.Date; + import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpc; import de.anomic.http.httpdProxyHandler; import de.anomic.index.indexURL; +import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEURL; +import de.anomic.plasma.plasmaCrawlLoader; +import de.anomic.plasma.plasmaCrawlLoaderMessage; +import de.anomic.plasma.plasmaCrawlProfile; +import de.anomic.plasma.plasmaHTCache; +import de.anomic.plasma.plasmaParser; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverSystem; import de.anomic.server.logging.serverLog; import de.anomic.tools.bitfield; @@ -67,10 +74,10 @@ import de.anomic.yacy.yacyCore; public final class plasmaCrawlWorker extends Thread { - private static final int DEFAULT_CRAWLING_RETRY_COUNT = 5; - static final String threadBaseName = "CrawlerWorker"; + public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5; + public static final String threadBaseName = "CrawlerWorker"; - private final CrawlerPool myPool; + private final plasmaCrawlerPool myPool; private final plasmaSwitchboard sb; private final plasmaHTCache cacheManager; private final serverLog log; @@ -91,29 +98,9 @@ public final class plasmaCrawlWorker extends Thread { private boolean stopped = false; private boolean done = false; - //private static boolean doCrawlerLogging = false; - - /** - * Do logging configuration for special proxy access log file - */ -// static { -// try { -// Logger crawlerLogger = Logger.getLogger("CRAWLER.access"); -// crawlerLogger.setUseParentHandlers(false); -// FileHandler txtLog = new FileHandler("log/crawlerAccess%u%g.log",1024*1024, 20, true); -// txtLog.setFormatter(new serverMiniLogFormatter()); -// txtLog.setLevel(Level.FINEST); -// crawlerLogger.addHandler(txtLog); -// -// doAccessLogging = true; -// } catch (Exception e) { -// System.err.println("PROXY: Unable to configure proxy access logging."); -// } -// } - public plasmaCrawlWorker( ThreadGroup theTG, - CrawlerPool thePool, + plasmaCrawlerPool thePool, plasmaSwitchboard theSb, plasmaHTCache theCacheManager, serverLog theLog) { @@ -245,7 +232,7 @@ public final class plasmaCrawlWorker extends Thread { if (closedSockets > 0) { this.log.logInfo(closedSockets + " HTTP-client sockets of thread '" + this.getName() + "' closed."); } - } catch (Exception e) {} + } catch (Exception e) {/* ignore this. shutdown in progress */} } } diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java new file mode 100644 index 000000000..ac179e0f9 --- /dev/null +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java @@ -0,0 +1,93 @@ +package de.anomic.plasma.crawler; + +import de.anomic.plasma.plasmaHTCache; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.logging.serverLog; + +public final class plasmaCrawlerFactory implements org.apache.commons.pool.PoolableObjectFactory { + + private plasmaCrawlerPool thePool; + private final ThreadGroup theThreadGroup; + private final plasmaHTCache cacheManager; + private final serverLog theLog; + private final plasmaSwitchboard sb; + + public plasmaCrawlerFactory( + ThreadGroup threadGroup, + plasmaSwitchboard theSb, + plasmaHTCache theCacheManager, + serverLog log) { + + super(); + + if (threadGroup == null) + throw new IllegalArgumentException("The threadgroup object must not be null."); + + this.theThreadGroup = threadGroup; + this.cacheManager = theCacheManager; + this.sb = theSb; + this.theLog = log; + } + + public void setPool(plasmaCrawlerPool pool) { + this.thePool = pool; + } + + /** + * @see org.apache.commons.pool.PoolableObjectFactory#makeObject() + */ + public Object makeObject() { + return new plasmaCrawlWorker( + this.theThreadGroup, + this.thePool, + this.sb, + this.cacheManager, + this.theLog); + } + + /** + * @see org.apache.commons.pool.PoolableObjectFactory#destroyObject(java.lang.Object) + */ + public void destroyObject(Object obj) { + if (obj == null) return; + if (obj instanceof plasmaCrawlWorker) { + plasmaCrawlWorker theWorker = (plasmaCrawlWorker) obj; + synchronized(theWorker) { + theWorker.destroyed = true; + theWorker.setName(plasmaCrawlWorker.threadBaseName + "_destroyed"); + theWorker.setStopped(true); + theWorker.interrupt(); + } + } + } + + /** + * @see org.apache.commons.pool.PoolableObjectFactory#validateObject(java.lang.Object) + */ + public boolean validateObject(Object obj) { + return true; + } + + /** + * @param obj + * + */ + public void activateObject(Object obj) { + //log.debug(" activateObject..."); + } + + /** + * @param obj + * + */ + + public void passivateObject(Object obj) { + //log.debug(" passivateObject..." + obj); + /* + if (obj instanceof plasmaCrawlWorker) { + plasmaCrawlWorker theWorker = (plasmaCrawlWorker) obj; + } + */ + } + +} diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerMsgQueue.java b/source/de/anomic/plasma/crawler/plasmaCrawlerMsgQueue.java new file mode 100644 index 000000000..2743455c1 --- /dev/null +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerMsgQueue.java @@ -0,0 +1,127 @@ +// plasmaCrawlerMsgQueue.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004 +// +// This file ist contributed by Martin Thelian +// +// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ +// $LastChangedRevision: 1715 $ +// $LastChangedBy: theli $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +package de.anomic.plasma.crawler; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; + +import de.anomic.plasma.plasmaCrawlLoaderMessage; +import de.anomic.server.serverSemaphore; + +public class plasmaCrawlerMsgQueue { + private final serverSemaphore readSync; + private final serverSemaphore writeSync; + private final ArrayList messageList; + + public plasmaCrawlerMsgQueue() { + this.readSync = new serverSemaphore (0); + this.writeSync = new serverSemaphore (1); + + this.messageList = new ArrayList(10); + } + + /** + * + * @param newMessage + * @throws MessageQueueLockedException + * @throws InterruptedException + */ + public void addMessage(plasmaCrawlLoaderMessage newMessage) + throws InterruptedException, NullPointerException + { + if (newMessage == null) throw new NullPointerException(); + + this.writeSync.P(); + + boolean insertionDoneSuccessfully = false; + synchronized(this.messageList) { + insertionDoneSuccessfully = this.messageList.add(newMessage); + } + + if (insertionDoneSuccessfully) { + this.sortMessages(); + this.readSync.V(); + } + + this.writeSync.V(); + } + + public plasmaCrawlLoaderMessage waitForMessage() throws InterruptedException { + this.readSync.P(); + this.writeSync.P(); + + plasmaCrawlLoaderMessage newMessage = null; + synchronized(this.messageList) { + newMessage = (plasmaCrawlLoaderMessage) this.messageList.remove(0); + } + + this.writeSync.V(); + return newMessage; + } + + protected void sortMessages() { + Collections.sort(this.messageList, new Comparator() { + public int compare(Object o1, Object o2) + { + plasmaCrawlLoaderMessage message1 = (plasmaCrawlLoaderMessage) o1; + plasmaCrawlLoaderMessage message2 = (plasmaCrawlLoaderMessage) o2; + + int message1Priority = message1.crawlingPriority; + int message2Priority = message2.crawlingPriority; + + if (message1Priority > message2Priority){ + return -1; + } else if (message1Priority < message2Priority) { + return 1; + } else { + return 0; + } + } + }); + } +} \ No newline at end of file diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java new file mode 100644 index 000000000..322974c81 --- /dev/null +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java @@ -0,0 +1,106 @@ +package de.anomic.plasma.crawler; + +import org.apache.commons.pool.impl.GenericObjectPool; + +import de.anomic.server.logging.serverLog; + +public final class plasmaCrawlerPool extends GenericObjectPool { + private final ThreadGroup theThreadGroup; + public boolean isClosed = false; + + public plasmaCrawlerPool(plasmaCrawlerFactory objFactory, + GenericObjectPool.Config config, + ThreadGroup threadGroup) { + super(objFactory, config); + this.theThreadGroup = threadGroup; + objFactory.setPool(this); + } + + public Object borrowObject() throws Exception { + return super.borrowObject(); + } + + public void returnObject(Object obj) { + if (obj == null) return; + if (obj instanceof plasmaCrawlWorker) { + try { + ((plasmaCrawlWorker)obj).setName(plasmaCrawlWorker.threadBaseName + "_inPool"); + super.returnObject(obj); + } catch (Exception e) { + ((plasmaCrawlWorker)obj).setStopped(true); + serverLog.logSevere("CRAWLER-POOL","Unable to return crawler thread to pool.",e); + } + } else { + serverLog.logSevere("CRAWLER-POOL","Object of wront type '" + obj.getClass().getName() + + "' returned to pool."); + } + } + + public void invalidateObject(Object obj) { + if (obj == null) return; + if (this.isClosed) return; + if (obj instanceof plasmaCrawlWorker) { + try { + ((plasmaCrawlWorker)obj).setName(plasmaCrawlWorker.threadBaseName + "_invalidated"); + ((plasmaCrawlWorker)obj).setStopped(true); + super.invalidateObject(obj); + } catch (Exception e) { + serverLog.logSevere("CRAWLER-POOL","Unable to invalidate crawling thread.",e); + } + } + } + + public synchronized void close() throws Exception { + try { + /* + * shutdown all still running session threads ... + */ + this.isClosed = true; + + /* waiting for all threads to finish */ + int threadCount = this.theThreadGroup.activeCount(); + Thread[] threadList = new Thread[threadCount]; + threadCount = this.theThreadGroup.enumerate(threadList); + + // signaling shutdown to all still running or pooled threads ... + serverLog.logInfo("CRAWLER","Signaling shutdown to " + threadCount + " remaining crawler threads ..."); + for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) { + ((plasmaCrawlWorker)threadList[currentThreadIdx]).setStopped(true); + } + + // giving the crawlers some time to finish shutdown + try { Thread.sleep(500); } catch(Exception e) {} + + // sending interrupted signal to all remaining threads + serverLog.logInfo("CRAWLER","Sending interruption signal to " + this.theThreadGroup.activeCount() + " remaining crawler threads ..."); + this.theThreadGroup.interrupt(); + + // aborting all crawlers by closing all still open httpc sockets + serverLog.logInfo("CRAWLER","Trying to abort " + this.theThreadGroup.activeCount() + " remaining crawler threads ..."); + for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) { + Thread currentThread = threadList[currentThreadIdx]; + if (currentThread.isAlive()) { + serverLog.logInfo("CRAWLER","Trying to shutdown crawler thread '" + currentThread.getName() + "' [" + currentThreadIdx + "]."); + ((plasmaCrawlWorker)currentThread).close(); + } + } + + serverLog.logInfo("CRAWLER","Waiting for " + this.theThreadGroup.activeCount() + " remaining crawler threads to finish shutdown ..."); + for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) { + Thread currentThread = threadList[currentThreadIdx]; + if (currentThread.isAlive()) { + serverLog.logInfo("CRAWLER","Waiting for crawler thread '" + currentThread.getName() + "' [" + currentThreadIdx + "] to finish shutdown."); + try { currentThread.join(500); } catch (InterruptedException ex) {} + } + } + serverLog.logWarning("CRAWLER","Shutdown of remaining crawler threads finish."); + } + catch (Exception e) { + serverLog.logWarning("CRAWLER","Unexpected error while trying to shutdown all remaining crawler threads.",e); + } + + super.close(); + + } + +} diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index 256674d95..21cdb2034 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -44,25 +44,24 @@ package de.anomic.plasma; -import de.anomic.net.URL; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; - import org.apache.commons.pool.impl.GenericObjectPool; -import de.anomic.server.serverSemaphore; +import de.anomic.net.URL; +import de.anomic.plasma.crawler.plasmaCrawlWorker; +import de.anomic.plasma.crawler.plasmaCrawlerFactory; +import de.anomic.plasma.crawler.plasmaCrawlerMsgQueue; +import de.anomic.plasma.crawler.plasmaCrawlerPool; import de.anomic.server.logging.serverLog; public final class plasmaCrawlLoader extends Thread { - static plasmaSwitchboard switchboard; + public static plasmaSwitchboard switchboard; private final plasmaHTCache cacheManager; private final serverLog log; - private final CrawlerMessageQueue theQueue; - private final CrawlerPool crawlwerPool; + private final plasmaCrawlerMsgQueue theQueue; + private final plasmaCrawlerPool crawlwerPool; private GenericObjectPool.Config crawlerPoolConfig = null; private final ThreadGroup theThreadGroup = new ThreadGroup("CrawlerThreads"); private boolean stopped = false; @@ -77,7 +76,7 @@ public final class plasmaCrawlLoader extends Thread { this.log = theLog; // configuring the crawler messagequeue - this.theQueue = new CrawlerMessageQueue(); + this.theQueue = new plasmaCrawlerMsgQueue(); // configuring the crawler thread pool // implementation of session thread pool @@ -102,13 +101,13 @@ public final class plasmaCrawlLoader extends Thread { //this.crawlerPoolConfig.timeBetweenEvictionRunsMillis = 30000; // config.testOnReturn = true; - CrawlerFactory theFactory = new CrawlerFactory( + plasmaCrawlerFactory theFactory = new plasmaCrawlerFactory( this.theThreadGroup, switchboard, - cacheManager, - log); + this.cacheManager, + this.log); - this.crawlwerPool = new CrawlerPool(theFactory,this.crawlerPoolConfig,this.theThreadGroup); + this.crawlwerPool = new plasmaCrawlerPool(theFactory,this.crawlerPoolConfig,this.theThreadGroup); // start the crawl loader this.start(); @@ -199,7 +198,7 @@ public final class plasmaCrawlLoader extends Thread { } public int getNumIdleWorker() { - return crawlwerPool.getNumIdle(); + return this.crawlwerPool.getNumIdle(); } public int getNumActiveWorker() { @@ -207,268 +206,11 @@ public final class plasmaCrawlLoader extends Thread { } public int size() { - return crawlwerPool.getNumActive(); - } -} - -class CrawlerMessageQueue { - private final serverSemaphore readSync; - private final serverSemaphore writeSync; - private final ArrayList messageList; - - public CrawlerMessageQueue() { - this.readSync = new serverSemaphore (0); - this.writeSync = new serverSemaphore (1); - - this.messageList = new ArrayList(10); - } - - /** - * - * @param newMessage - * @throws MessageQueueLockedException - * @throws InterruptedException - */ - public void addMessage(plasmaCrawlLoaderMessage newMessage) - throws InterruptedException, NullPointerException - { - if (newMessage == null) throw new NullPointerException(); - - this.writeSync.P(); - - boolean insertionDoneSuccessfully = false; - synchronized(this.messageList) { - insertionDoneSuccessfully = this.messageList.add(newMessage); - } - - if (insertionDoneSuccessfully) { - this.sortMessages(); - this.readSync.V(); - } - - this.writeSync.V(); - } - - public plasmaCrawlLoaderMessage waitForMessage() throws InterruptedException { - this.readSync.P(); - this.writeSync.P(); - - plasmaCrawlLoaderMessage newMessage = null; - synchronized(this.messageList) { - newMessage = (plasmaCrawlLoaderMessage) this.messageList.remove(0); - } - - this.writeSync.V(); - return newMessage; - } - - protected void sortMessages() { - Collections.sort(this.messageList, new Comparator() { - public int compare(Object o1, Object o2) - { - plasmaCrawlLoaderMessage message1 = (plasmaCrawlLoaderMessage) o1; - plasmaCrawlLoaderMessage message2 = (plasmaCrawlLoaderMessage) o2; - - int message1Priority = message1.crawlingPriority; - int message2Priority = message2.crawlingPriority; - - if (message1Priority > message2Priority){ - return -1; - } else if (message1Priority < message2Priority) { - return 1; - } else { - return 0; - } - } - }); - } -} - -final class CrawlerPool extends GenericObjectPool { - private final ThreadGroup theThreadGroup; - public boolean isClosed = false; - - public CrawlerPool(CrawlerFactory objFactory, - GenericObjectPool.Config config, - ThreadGroup threadGroup) { - super(objFactory, config); - this.theThreadGroup = threadGroup; - objFactory.setPool(this); - } - - public Object borrowObject() throws Exception { - return super.borrowObject(); - } - - public void returnObject(Object obj) { - if (obj == null) return; - if (obj instanceof plasmaCrawlWorker) { - try { - ((plasmaCrawlWorker)obj).setName(plasmaCrawlWorker.threadBaseName + "_inPool"); - super.returnObject(obj); - } catch (Exception e) { - ((plasmaCrawlWorker)obj).setStopped(true); - serverLog.logSevere("CRAWLER-POOL","Unable to return crawler thread to pool.",e); - } - } else { - serverLog.logSevere("CRAWLER-POOL","Object of wront type '" + obj.getClass().getName() + - "' returned to pool."); - } - } - - public void invalidateObject(Object obj) { - if (obj == null) return; - if (this.isClosed) return; - if (obj instanceof plasmaCrawlWorker) { - try { - ((plasmaCrawlWorker)obj).setName(plasmaCrawlWorker.threadBaseName + "_invalidated"); - ((plasmaCrawlWorker)obj).setStopped(true); - super.invalidateObject(obj); - } catch (Exception e) { - serverLog.logSevere("CRAWLER-POOL","Unable to invalidate crawling thread.",e); - } - } - } - - public synchronized void close() throws Exception { - try { - /* - * shutdown all still running session threads ... - */ - this.isClosed = true; - - /* waiting for all threads to finish */ - int threadCount = this.theThreadGroup.activeCount(); - Thread[] threadList = new Thread[threadCount]; - threadCount = this.theThreadGroup.enumerate(threadList); - - // signaling shutdown to all still running or pooled threads ... - serverLog.logInfo("CRAWLER","Signaling shutdown to " + threadCount + " remaining crawler threads ..."); - for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) { - ((plasmaCrawlWorker)threadList[currentThreadIdx]).setStopped(true); - } - - // giving the crawlers some time to finish shutdown - try { Thread.sleep(500); } catch(Exception e) {} - - // sending interrupted signal to all remaining threads - serverLog.logInfo("CRAWLER","Sending interruption signal to " + this.theThreadGroup.activeCount() + " remaining crawler threads ..."); - this.theThreadGroup.interrupt(); - - // aborting all crawlers by closing all still open httpc sockets - serverLog.logInfo("CRAWLER","Trying to abort " + this.theThreadGroup.activeCount() + " remaining crawler threads ..."); - for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) { - Thread currentThread = threadList[currentThreadIdx]; - if (currentThread.isAlive()) { - serverLog.logInfo("CRAWLER","Trying to shutdown crawler thread '" + currentThread.getName() + "' [" + currentThreadIdx + "]."); - ((plasmaCrawlWorker)currentThread).close(); - } - } - - serverLog.logInfo("CRAWLER","Waiting for " + this.theThreadGroup.activeCount() + " remaining crawler threads to finish shutdown ..."); - for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) { - Thread currentThread = threadList[currentThreadIdx]; - if (currentThread.isAlive()) { - serverLog.logInfo("CRAWLER","Waiting for crawler thread '" + currentThread.getName() + "' [" + currentThreadIdx + "] to finish shutdown."); - try { currentThread.join(500); } catch (InterruptedException ex) {} - } - } - serverLog.logWarning("CRAWLER","Shutdown of remaining crawler threads finish."); - } - catch (Exception e) { - serverLog.logWarning("CRAWLER","Unexpected error while trying to shutdown all remaining crawler threads.",e); - } - - super.close(); - + return this.crawlwerPool.getNumActive(); } - } -final class CrawlerFactory implements org.apache.commons.pool.PoolableObjectFactory { - - private CrawlerPool thePool; - private final ThreadGroup theThreadGroup; - private final plasmaHTCache cacheManager; - private final serverLog theLog; - private final plasmaSwitchboard sb; - - public CrawlerFactory( - ThreadGroup threadGroup, - plasmaSwitchboard theSb, - plasmaHTCache theCacheManager, - serverLog log) { - - super(); - if (threadGroup == null) - throw new IllegalArgumentException("The threadgroup object must not be null."); - this.theThreadGroup = threadGroup; - this.cacheManager = theCacheManager; - this.sb = theSb; - this.theLog = log; - } - public void setPool(CrawlerPool pool) { - this.thePool = pool; - } - /** - * @see org.apache.commons.pool.PoolableObjectFactory#makeObject() - */ - public Object makeObject() { - return new plasmaCrawlWorker( - this.theThreadGroup, - this.thePool, - this.sb, - this.cacheManager, - this.theLog); - } - - /** - * @see org.apache.commons.pool.PoolableObjectFactory#destroyObject(java.lang.Object) - */ - public void destroyObject(Object obj) { - if (obj == null) return; - if (obj instanceof plasmaCrawlWorker) { - plasmaCrawlWorker theWorker = (plasmaCrawlWorker) obj; - synchronized(theWorker) { - theWorker.destroyed = true; - theWorker.setName(plasmaCrawlWorker.threadBaseName + "_destroyed"); - theWorker.setStopped(true); - theWorker.interrupt(); - } - } - } - - /** - * @see org.apache.commons.pool.PoolableObjectFactory#validateObject(java.lang.Object) - */ - public boolean validateObject(Object obj) { - return true; - } - - /** - * @param obj - * - */ - public void activateObject(Object obj) { - //log.debug(" activateObject..."); - } - - /** - * @param obj - * - */ - - public void passivateObject(Object obj) { - //log.debug(" passivateObject..." + obj); - /* - if (obj instanceof plasmaCrawlWorker) { - plasmaCrawlWorker theWorker = (plasmaCrawlWorker) obj; - } - */ - } - -} \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaParserConfig.java b/source/de/anomic/plasma/plasmaParserConfig.java index 80c2d4334..146e6e06e 100644 --- a/source/de/anomic/plasma/plasmaParserConfig.java +++ b/source/de/anomic/plasma/plasmaParserConfig.java @@ -1,6 +1,50 @@ -/** - * - */ +// plasmaParserConfig.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004 +// +// This file ist contributed by Martin Thelian +// +// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ +// $LastChangedRevision: 1715 $ +// $LastChangedBy: theli $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + + package de.anomic.plasma; import de.anomic.net.URL; diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index d4a2efd69..a528fb689 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -44,6 +44,8 @@ package de.anomic.plasma; import java.io.IOException; import de.anomic.net.URL; +import de.anomic.plasma.crawler.plasmaCrawlWorker; + import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet;