From 252c6e4869519ae5d58e93d7b11f2cae306cf715 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 4 Jul 2005 15:07:33 +0000
Subject: [PATCH] added crawl queue monitor for global crawls
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@372 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/IndexCreateIndexingQueue_p.java | 9 --
htroot/IndexCreateLoaderQueue_p.java | 9 --
htroot/IndexCreateWWWGlobalQueue_p.html | 48 +++++++
htroot/IndexCreateWWWGlobalQueue_p.java | 118 ++++++++++++++++++
...p.html => IndexCreateWWWLocalQueue_p.html} | 17 +--
...p.java => IndexCreateWWWLocalQueue_p.java} | 26 +---
htroot/IndexCreate_p.java | 5 -
.../env/templates/submenuIndexCreate.template | 12 +-
8 files changed, 186 insertions(+), 58 deletions(-)
create mode 100644 htroot/IndexCreateWWWGlobalQueue_p.html
create mode 100644 htroot/IndexCreateWWWGlobalQueue_p.java
rename htroot/{IndexCreateWWWLocalCrawlQueue_p.html => IndexCreateWWWLocalQueue_p.html} (62%)
rename htroot/{IndexCreateWWWLocalCrawlQueue_p.java => IndexCreateWWWLocalQueue_p.java} (78%)
diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java
index 7586b1e17..7e8c4a8a6 100644
--- a/htroot/IndexCreateIndexingQueue_p.java
+++ b/htroot/IndexCreateIndexingQueue_p.java
@@ -43,23 +43,14 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
-import java.net.MalformedURLException;
-import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
-import java.util.Enumeration;
-import java.util.Iterator;
import java.util.Locale;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEURL;
-import de.anomic.plasma.plasmaCrawlLoaderMessage;
-import de.anomic.plasma.plasmaCrawlNURL;
-import de.anomic.plasma.plasmaCrawlProfile;
-import de.anomic.plasma.plasmaCrawlWorker;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
diff --git a/htroot/IndexCreateLoaderQueue_p.java b/htroot/IndexCreateLoaderQueue_p.java
index dfb9d46ec..1e7eaebe0 100644
--- a/htroot/IndexCreateLoaderQueue_p.java
+++ b/htroot/IndexCreateLoaderQueue_p.java
@@ -43,23 +43,14 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
-import java.net.MalformedURLException;
-import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
-import java.util.Enumeration;
-import java.util.Iterator;
import java.util.Locale;
import de.anomic.http.httpHeader;
-import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLoaderMessage;
-import de.anomic.plasma.plasmaCrawlNURL;
-import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlWorker;
-import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
diff --git a/htroot/IndexCreateWWWGlobalQueue_p.html b/htroot/IndexCreateWWWGlobalQueue_p.html
new file mode 100644
index 000000000..fd44c10c0
--- /dev/null
+++ b/htroot/IndexCreateWWWGlobalQueue_p.html
@@ -0,0 +1,48 @@
+<html>
+<head>
+  <title>YaCy: Index Creation / WWW Global Crawl Queue</title>
+  #[metas]#
+</head>
+<body>
+#[header]#
+#[submenuIndexCreate]#
+<br>
+<h2>Index Creation: WWW Global Crawl Queue</h2>
+<p>
+This queue stores the URLs that shall be sent to other peers to perform a remote crawl.
+If there is no peer available for remote crawling, the links are crawled locally.
+</p>
+
+#(crawler-queue)#
+The global crawler queue is empty
+::
+<p>
+There are #[num]# entries in the global crawler queue. Showing #[show-num]# most recent entries:
+</p>
+<table border="0" cellpadding="2" cellspacing="1">
+#{list}#
+<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
+  <td>#[initiator]#</td>
+  <td>#[depth]#</td>
+  <td>#[modified]#</td>
+  <td>#[anchor]#</td>
+  <td>#[url]#</td>
+</tr>
+#{/list}#
+</table>
+#(/crawler-queue)#
+
+#[footer]#
+</body>
+</html>
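As in the other htroot templates, the page relies on YaCy's servlet template markers: #[key]# is a plain substitution, #(key)# ... :: ... #(/key)# switches between alternative fragments according to the integer stored under key (here 0 for the empty-queue message, 1 for the table), and #{list}# ... #{/list}# repeats its body once per numbered entry. The values are supplied by the accompanying servlet, IndexCreateWWWGlobalQueue_p.java, added below.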
diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java
new file mode 100644
index 000000000..d7dbf45f0
--- /dev/null
+++ b/htroot/IndexCreateWWWGlobalQueue_p.java
@@ -0,0 +1,118 @@
+// IndexCreateWWWGlobalQueue_p.java
+// --------------------------------
+// part of the AnomicHTTPD caching proxy
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2004, 2005
+// last major change: 04.07.2005
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this software or this documentation. The usage of this software
+// is at your own risk. The installation and usage (starting/running) of this
+// software may allow other people or applications to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follow this copyright notice here, but changes must not be
+// done inside the copyright notice above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+// You must compile this file with
+// javac -classpath .:../classes IndexCreateWWWGlobalQueue_p.java
+// if the shell's current path is HTROOT
+
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+
+import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaCrawlNURL;
+import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+import de.anomic.yacy.yacyCore;
+import de.anomic.yacy.yacySeed;
+
+public class IndexCreateWWWGlobalQueue_p {
+
+    private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
+    private static String daydate(Date date) {
+        if (date == null) return ""; else return dayFormatter.format(date);
+    }
+
+    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
+        // return variable that accumulates replacements
+        plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
+        serverObjects prop = new serverObjects();
+
+        if (post != null) {
+            if (post.containsKey("clearcrawlqueue")) {
+                String urlHash;
+                int c = 0;
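+                // drain the remote-crawl (LIMIT) stack and delete every popped entry from the notice-URL pool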
+                while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) {
+                    urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash();
+                    if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
+                }
+                prop.put("info", 3); // crawling queue cleared
+                prop.put("info_numEntries", c);
+            }
+        }
+
+        int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+        if (stackSize == 0) {
+            prop.put("crawler-queue", 0);
+        } else {
+            prop.put("crawler-queue", 1);
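+            // fetch up to 100 entries from the top of the remote-crawl stack for display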
+            plasmaCrawlNURL.entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, 100);
+            prop.put("crawler-queue_num", stackSize); // number of entries in the queue
+            prop.put("crawler-queue_show-num", crawlerList.length); // showing the show-num most recent entries
+            plasmaCrawlNURL.entry urle;
+            boolean dark = true;
+            yacySeed initiator;
+            int i;
+            for (i = 0; i < crawlerList.length; i++) {
+                urle = crawlerList[i];
+                if (urle != null) {
+                    initiator = yacyCore.seedDB.getConnected(urle.initiator());
+                    prop.put("crawler-queue_list_" + i + "_dark", ((dark) ? 1 : 0));
+                    prop.put("crawler-queue_list_" + i + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
+                    prop.put("crawler-queue_list_" + i + "_depth", urle.depth());
+                    prop.put("crawler-queue_list_" + i + "_modified", daydate(urle.loaddate()));
+                    prop.put("crawler-queue_list_" + i + "_anchor", urle.name());
+                    prop.put("crawler-queue_list_" + i + "_url", urle.url());
+                    dark = !dark;
+                }
+            }
+            prop.put("crawler-queue_list", i);
+        }
+
+        // return rewrite properties
+        return prop;
+    }
+
+}
+
+
+
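For orientation, here is a minimal sketch (illustrative only, with a made-up class name and example URLs; not part of the patch) of how the keys written by respond() drive the template above: the integer under "crawler-queue" selects the #(crawler-queue)# branch, "crawler-queue_num" and "crawler-queue_show-num" fill #[num]# and #[show-num]#, and the numbered "crawler-queue_list_<i>_*" keys plus the row count in "crawler-queue_list" feed the #{list}# loop.

    import de.anomic.server.serverObjects;

    public class GlobalQueueTemplateExample {
        public static void main(String[] args) {
            serverObjects prop = new serverObjects();
            prop.put("crawler-queue", 1);                        // 0 = "queue is empty" branch, 1 = table branch
            prop.put("crawler-queue_num", 2);                    // replaces #[num]#
            prop.put("crawler-queue_show-num", 2);               // replaces #[show-num]#
            prop.put("crawler-queue_list_0_initiator", "proxy");
            prop.put("crawler-queue_list_0_depth", 0);
            prop.put("crawler-queue_list_0_url", "http://www.example.org/");
            prop.put("crawler-queue_list_1_initiator", "some-peer");
            prop.put("crawler-queue_list_1_depth", 1);
            prop.put("crawler-queue_list_1_url", "http://www.example.org/page.html");
            prop.put("crawler-queue_list", 2);                   // number of #{list}# rows to render
        }
    }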
diff --git a/htroot/IndexCreateWWWLocalCrawlQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html
similarity index 62%
rename from htroot/IndexCreateWWWLocalCrawlQueue_p.html
rename to htroot/IndexCreateWWWLocalQueue_p.html
index cf84ef40c..9559e5205 100644
--- a/htroot/IndexCreateWWWLocalCrawlQueue_p.html
+++ b/htroot/IndexCreateWWWLocalQueue_p.html
@@ -1,24 +1,27 @@
-YaCy: Index Creation / WWW Crawl Queue
+YaCy: Index Creation / WWW Local Crawl Queue
#[metas]#
#[header]#
#[submenuIndexCreate]#
-Index Creation: WWW Crawl Queue
-
+Index Creation: WWW Local Crawl Queue
+
+This queue stores the URLs that shall be crawled locally by this peer.
+It may also contain URLs that are computed by the proxy prefetch.
+
#(crawler-queue)#
-The crawler queue is empty
+The local crawler queue is empty
::
-
-There are #[num]# entries in the crawler queue. Showing #[show-num]# most recent entries:
+There are #[num]# entries in the local crawler queue. Showing #[show-num]# most recent entries:
\ No newline at end of file