From 252c6e4869519ae5d58e93d7b11f2cae306cf715 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 4 Jul 2005 15:07:33 +0000 Subject: [PATCH] added crawl queue monitor for global crawls git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@372 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreateIndexingQueue_p.java | 9 -- htroot/IndexCreateLoaderQueue_p.java | 9 -- htroot/IndexCreateWWWGlobalQueue_p.html | 48 +++++++ htroot/IndexCreateWWWGlobalQueue_p.java | 118 ++++++++++++++++++ ...p.html => IndexCreateWWWLocalQueue_p.html} | 17 +-- ...p.java => IndexCreateWWWLocalQueue_p.java} | 26 +--- htroot/IndexCreate_p.java | 5 - .../env/templates/submenuIndexCreate.template | 12 +- 8 files changed, 186 insertions(+), 58 deletions(-) create mode 100644 htroot/IndexCreateWWWGlobalQueue_p.html create mode 100644 htroot/IndexCreateWWWGlobalQueue_p.java rename htroot/{IndexCreateWWWLocalCrawlQueue_p.html => IndexCreateWWWLocalQueue_p.html} (62%) rename htroot/{IndexCreateWWWLocalCrawlQueue_p.java => IndexCreateWWWLocalQueue_p.java} (78%) diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index 7586b1e17..7e8c4a8a6 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -43,23 +43,14 @@ // javac -classpath .:../classes IndexCreate_p.java // if the shell's current path is HTROOT -import java.net.MalformedURLException; -import java.net.URL; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.Enumeration; -import java.util.Iterator; import java.util.Locale; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlEURL; -import de.anomic.plasma.plasmaCrawlLoaderMessage; -import de.anomic.plasma.plasmaCrawlNURL; -import de.anomic.plasma.plasmaCrawlProfile; -import de.anomic.plasma.plasmaCrawlWorker; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaURL; import de.anomic.server.serverObjects; 
import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; diff --git a/htroot/IndexCreateLoaderQueue_p.java b/htroot/IndexCreateLoaderQueue_p.java index dfb9d46ec..1e7eaebe0 100644 --- a/htroot/IndexCreateLoaderQueue_p.java +++ b/htroot/IndexCreateLoaderQueue_p.java @@ -43,23 +43,14 @@ // javac -classpath .:../classes IndexCreate_p.java // if the shell's current path is HTROOT -import java.net.MalformedURLException; -import java.net.URL; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.Enumeration; -import java.util.Iterator; import java.util.Locale; import de.anomic.http.httpHeader; -import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlLoaderMessage; -import de.anomic.plasma.plasmaCrawlNURL; -import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlWorker; -import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaURL; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; diff --git a/htroot/IndexCreateWWWGlobalQueue_p.html b/htroot/IndexCreateWWWGlobalQueue_p.html new file mode 100644 index 000000000..fd44c10c0 --- /dev/null +++ b/htroot/IndexCreateWWWGlobalQueue_p.html @@ -0,0 +1,48 @@ + + + +YaCy: Index Creation / WWW Global Crawl Queue +#[metas]# + + +#[header]# +#[submenuIndexCreate]# +
+

Index Creation: WWW Global Crawl Queue

+

+This queue stores the urls that shall be sent to other peers to perform a remote crawl. +If no peer is available for remote crawling, the links are crawled locally. +

+

+#(crawler-queue)# +The global crawler queue is empty

+:: +

+ +
+
+There are #[num]# entries in the global crawler queue. Showing #[show-num]# most recent entries: + + + + + + + + +#{list}# + + + + + + + +#{/list}# +
InitiatorDepthModified DateAnchor NameURL
#[initiator]##[depth]##[modified]##[anchor]##[url]#
+#(/crawler-queue)# +

+ +#[footer]# + + diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java new file mode 100644 index 000000000..d7dbf45f0 --- /dev/null +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -0,0 +1,118 @@ +// IndexCreateWWWCrawlQueue_p.java +// ------------------------------- +// part of the AnomicHTTPD caching proxy +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004, 2005 +// last major change: 04.07.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. 
The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +// You must compile this file with +// javac -classpath .:../classes IndexCreate_p.java +// if the shell's current path is HTROOT + +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + +import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaCrawlNURL; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacySeed; + +public class IndexCreateWWWGlobalQueue_p { + + private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); + private static String daydate(Date date) { + if (date == null) return ""; else return dayFormatter.format(date); + } + + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { + // return variable that accumulates replacements + plasmaSwitchboard switchboard = (plasmaSwitchboard) env; + serverObjects prop = new serverObjects(); + + if (post != null) { + if (post.containsKey("clearcrawlqueue")) { + String urlHash; + int c = 0; + while 
(switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) { + urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash(); + if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } + } + prop.put("info", 3);//crawling queue cleared + prop.put("info_numEntries", c); + } + } + + int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); + if (stackSize == 0) { + prop.put("crawler-queue", 0); + } else { + prop.put("crawler-queue", 1); + plasmaCrawlNURL.entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, 100); + prop.put("crawler-queue_num", stackSize);//num Entries + prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent + plasmaCrawlNURL.entry urle; + boolean dark = true; + yacySeed initiator; + int i; + for (i = 0; i < crawlerList.length; i++) { + urle = crawlerList[i]; + if (urle != null) { + initiator = yacyCore.seedDB.getConnected(urle.initiator()); + prop.put("crawler-queue_list_"+i+"_dark", ((dark) ? 1 : 0) ); + prop.put("crawler-queue_list_"+i+"_initiator", ((initiator == null) ? 
"proxy" : initiator.getName()) ); + prop.put("crawler-queue_list_"+i+"_depth", urle.depth()); + prop.put("crawler-queue_list_"+i+"_modified", daydate(urle.loaddate()) ); + prop.put("crawler-queue_list_"+i+"_anchor", urle.name()); + prop.put("crawler-queue_list_"+i+"_url", urle.url()); + dark = !dark; + } + } + prop.put("crawler-queue_list", i); + } + + // return rewrite properties + return prop; + } + +} + + + diff --git a/htroot/IndexCreateWWWLocalCrawlQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html similarity index 62% rename from htroot/IndexCreateWWWLocalCrawlQueue_p.html rename to htroot/IndexCreateWWWLocalQueue_p.html index cf84ef40c..9559e5205 100644 --- a/htroot/IndexCreateWWWLocalCrawlQueue_p.html +++ b/htroot/IndexCreateWWWLocalQueue_p.html @@ -1,24 +1,27 @@ -YaCy: Index Creation / WWW Crawl Queue +YaCy: Index Creation / WWW Local Crawl Queue #[metas]# #[header]# #[submenuIndexCreate]#
-

Index Creation: WWW Crawl Queue

- +

Index Creation: WWW Local Crawl Queue

+

+This queue stores the urls that shall be crawled locally by this peer. +It may also contain urls that are computed by the proxy-prefetch. +

#(crawler-queue)# -The crawler queue is empty

+The local crawler queue is empty

:: -

- + +

-There are #[num]# entries in the crawler queue. Showing #[show-num]# most recent entries: +There are #[num]# entries in the local crawler queue. Showing #[show-num]# most recent entries: diff --git a/htroot/IndexCreateWWWLocalCrawlQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java similarity index 78% rename from htroot/IndexCreateWWWLocalCrawlQueue_p.java rename to htroot/IndexCreateWWWLocalQueue_p.java index 54810e15c..9fb72806e 100644 --- a/htroot/IndexCreateWWWLocalCrawlQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -43,29 +43,19 @@ // javac -classpath .:../classes IndexCreate_p.java // if the shell's current path is HTROOT -import java.net.MalformedURLException; -import java.net.URL; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.Enumeration; -import java.util.Iterator; import java.util.Locale; import de.anomic.http.httpHeader; -import de.anomic.plasma.plasmaCrawlEURL; -import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlNURL; -import de.anomic.plasma.plasmaCrawlProfile; -import de.anomic.plasma.plasmaCrawlWorker; -import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaURL; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; -public class IndexCreateWWWLocalCrawlQueue_p { +public class IndexCreateWWWLocalQueue_p { private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); private static String daydate(Date date) { @@ -85,26 +75,18 @@ public class IndexCreateWWWLocalCrawlQueue_p { urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE).hash(); if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } } - while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) { - urlHash = 
switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash(); - if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } - } - while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) > 0) { - urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash(); - if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } - } prop.put("info", 3);//crawling queue cleared prop.put("info_numEntries", c); } } - int localStackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); - if (localStackSize == 0) { + int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); + if (stackSize == 0) { prop.put("crawler-queue", 0); } else { prop.put("crawler-queue", 1); plasmaCrawlNURL.entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 100); - prop.put("crawler-queue_num", localStackSize);//num Entries + prop.put("crawler-queue_num", stackSize);//num Entries prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent plasmaCrawlNURL.entry urle; boolean dark = true; diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index 0ac1b6afa..13f5f6135 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -52,12 +52,7 @@ import java.util.Iterator; import java.util.Locale; import de.anomic.http.httpHeader; -import de.anomic.plasma.plasmaCrawlEURL; -import de.anomic.plasma.plasmaCrawlLoaderMessage; -import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; -import de.anomic.plasma.plasmaCrawlWorker; -import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; import de.anomic.server.serverObjects; diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template index c8d4a92ae..1c8be4f85 100644 --- 
a/htroot/env/templates/submenuIndexCreate.template +++ b/htroot/env/templates/submenuIndexCreate.template @@ -19,21 +19,21 @@ Loader  +Local  +Global  +Overhang  +Images  +Movies  +Music 
Initiator
\ No newline at end of file