From 086239da36e9979f45f4aee1bd16a532b9b6b345 Mon Sep 17 00:00:00 2001 From: karlchenofhell Date: Wed, 16 May 2007 10:11:25 +0000 Subject: [PATCH] - added servlet: remote crawler queue overview - added servlet: crawl profile editor git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3731 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlProfileEditor_p.html | 49 ++++++ htroot/CrawlProfileEditor_p.java | 165 ++++++++++++++++++ htroot/IndexCreateWWWRemoteQueue_p.html | 65 +++++++ htroot/IndexCreateWWWRemoteQueue_p.java | 142 +++++++++++++++ htroot/Status_p.inc | 2 +- .../env/templates/submenuIndexCreate.template | 9 +- .../de/anomic/plasma/plasmaCrawlProfile.java | 120 +++++++------ source/de/anomic/server/serverDate.java | 4 +- source/de/anomic/yacy/yacySeed.java | 2 - 9 files changed, 502 insertions(+), 56 deletions(-) create mode 100644 htroot/CrawlProfileEditor_p.html create mode 100644 htroot/CrawlProfileEditor_p.java create mode 100644 htroot/IndexCreateWWWRemoteQueue_p.html create mode 100644 htroot/IndexCreateWWWRemoteQueue_p.java diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html new file mode 100644 index 000000000..36341fddf --- /dev/null +++ b/htroot/CrawlProfileEditor_p.html @@ -0,0 +1,49 @@ + + + + YaCy '#[clientname]#': Crawl Profile Editor + #%env/templates/metas.template%# + + + #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%# +

Crawl Profile Editor

+

+ Crawl profiles hold information about a specific URL which is internally used to perform the crawl it belongs to. + The profiles for remote crawls, indexing via proxy and snippet fetches + cannot be altered here as they are hard-coded. +

+
+
Select the profile to edit + + +
+
+ + #(error)#:: +

An error occurred while editing the crawl profile: #[message]#

+ #(/error)# + + #(edit)#:: +
+
Edit Profile #[name]# + +
#{entries}# +
#(readonly)#::#[label]##(/readonly)#
+
#(readonly)# + :: + #(type)##(checked)#false::true#(/checked)#::#[value]#::#[value]##(/type)##(/readonly)# +
#{/entries}# +
+ +
+
+ #(/edit)# + #%env/templates/footer.template%# + + \ No newline at end of file diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java new file mode 100644 index 000000000..0f0691065 --- /dev/null +++ b/htroot/CrawlProfileEditor_p.java @@ -0,0 +1,165 @@ +// CrawlProfileEditor_p.java +// ------------------------------- +// part of the AnomicHTTPD caching proxy +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004, 2005 +// last major change: 04.07.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. 
The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +// You must compile this file with +// javac -classpath .:../classes CrawlProfileEditor_p.java +// if the shell's current path is HTROOT + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; + +import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaCrawlProfile.entry; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; +import de.anomic.server.servletProperties; + +public class CrawlProfileEditor_p { + + public static class eentry { + public static final int BOOLEAN = 0; + public static final int INTEGER = 1; + public static final int STRING = 2; + + public final String name; + public final String label; + public final boolean readonly; + public final int type; + + public eentry(String name, String label, boolean readonly, int type) { + this.name = name; + this.label = label; + this.readonly = readonly; + this.type = type; + } + } + + private static final ArrayList /**/ labels = new ArrayList(); + static { + labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING)); + labels.add(new 
eentry(entry.START_URL, "Start URL", true, eentry.STRING)); + labels.add(new eentry(entry.GENERAL_FILTER, "General Filter", false, eentry.STRING)); + labels.add(new eentry(entry.SPECIFIC_FILTER, "Specific Filter", false, eentry.STRING)); + labels.add(new eentry(entry.GENERAL_DEPTH, "General Depth", false, eentry.INTEGER)); + labels.add(new eentry(entry.SPECIFIC_DEPTH, "Specific Depth", false, eentry.INTEGER)); + labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); + labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER)); + labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); + labels.add(new eentry(entry.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN)); + labels.add(new eentry(entry.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); + labels.add(new eentry(entry.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN)); + labels.add(new eentry(entry.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN)); + labels.add(new eentry(entry.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN)); + labels.add(new eentry(entry.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN)); + labels.add(new eentry(entry.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN)); + labels.add(new eentry(entry.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); + labels.add(new eentry(entry.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN)); + } + + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { + final servletProperties prop = new servletProperties(); + final plasmaSwitchboard sb = (plasmaSwitchboard)env; + + String handle = (post == null) ? 
"" : post.get("handle", ""); + + int count = 0; + Iterator it = sb.profiles.profiles(true); + entry e; + while (it.hasNext()) { + e = (entry)it.next(); + if (e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) || + e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) || + e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) || + e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)) + continue; + prop.put("profiles_" + count + "_name", e.name()); + prop.put("profiles_" + count + "_handle", e.handle()); + if (handle.equals(e.handle())) + prop.put("profiles_" + count + "_selected", 1); + count++; + } + prop.put("profiles", count); + + e = sb.profiles.getEntry(handle); + if (e == null) return prop; + if (post.containsKey("submit")) try { + it = labels.iterator(); + eentry tee; + while (it.hasNext()) { + tee = (eentry)it.next(); + String cval = (String)e.map().get(tee.name); + String val = (tee.type == eentry.BOOLEAN) + ? Boolean.toString(post.containsKey(tee.name)) + : post.get(tee.name, cval); + if (!cval.equals(val)) + e.changeEntry(tee.name, val); + } + } catch (IOException ex) { + prop.put("error", 1); + prop.put("error_message", ex.getMessage()); + } + + prop.put("edit", 1); + prop.put("edit_name", e.name()); + prop.put("edit_handle", e.handle()); + it = labels.iterator(); + count = 0; + while (it.hasNext()) { + eentry ee = (eentry)it.next(); + Object val = e.map().get(ee.name); + prop.put("edit_entries_" + count + "_readonly", ee.readonly ? 1 : 0); + prop.put("edit_entries_" + count + "_readonly_name", ee.name); + prop.put("edit_entries_" + count + "_readonly_label", ee.label); + prop.put("edit_entries_" + count + "_readonly_type", ee.type); + if (ee.type == eentry.BOOLEAN) { + prop.put("edit_entries_" + count + "_readonly_type_checked", Boolean.parseBoolean((String)val) ? 
1 : 0); + } else { + prop.put("edit_entries_" + count + "_readonly_type_value", val); + } + count++; + } + prop.put("edit_entries", count); + + return prop; + } +} diff --git a/htroot/IndexCreateWWWRemoteQueue_p.html b/htroot/IndexCreateWWWRemoteQueue_p.html new file mode 100644 index 000000000..1d296d2af --- /dev/null +++ b/htroot/IndexCreateWWWRemoteQueue_p.html @@ -0,0 +1,65 @@ + + + + YaCy '#[clientname]#': Index Creation / WWW Remote Crawl Queue + #%env/templates/metas.template%# + + + #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%# +

Index Creation: WWW Remote Crawl Queue

+

+ This queue stores the URLs that other peers have sent to you in order to perform a remote crawl for them. +

+ #(crawler-queue)# +

The remote crawler queue is empty

+ :: +
+
+ +
+
+

+ There are #[num]# entries in the remote crawler queue. + Showing #[show-num]# most recent entries. +

+

+ Show last 50 | + 100 | + 250 | + 500 entries. +

+ + + + + + + + + + + + + + + + + + + #{list}# + + + + + + + + + + #{/list}# +
InitiatorProfileDepthModified DateAnchor NameURLDelete
#[initiator]##[profile]##[depth]##[modified]##[anchor]##[url]#[Delete]
+ #(/crawler-queue)# + #%env/templates/footer.template%# + + diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java new file mode 100644 index 000000000..a7ca40370 --- /dev/null +++ b/htroot/IndexCreateWWWRemoteQueue_p.java @@ -0,0 +1,142 @@ +// IndexCreateWWWRemoteQueue_p.java +// ------------------------------- +// part of the AnomicHTTPD caching proxy +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004, 2005 +// last major change: 04.07.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. 
The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +// You must compile this file with +// javac -classpath .:../classes IndexCreateWWWRemoteQueue_p.java +// if the shell's current path is HTROOT + +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + +import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaCrawlEntry; +import de.anomic.plasma.plasmaCrawlNURL; +import de.anomic.plasma.plasmaCrawlProfile; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; +import de.anomic.server.servletProperties; +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacySeed; + +public class IndexCreateWWWRemoteQueue_p { + + private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); + private static String daydate(Date date) { + if (date == null) return ""; + return dayFormatter.format(date); + } + + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { + final servletProperties prop = new servletProperties(); + final plasmaSwitchboard sb = (plasmaSwitchboard)env; + + int 
showLimit = 100; + if (post != null) { + if (post.containsKey("limit")) { + try { + showLimit = Integer.parseInt((String)post.get("limit")); + } catch (NumberFormatException e) { } + } + + if (post.containsKey("clearcrawlqueue")) { + int c = sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE); + sb.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_REMOTE); + try { sb.cleanProfiles(); } catch (InterruptedException e) { /* Ignore this */} + /* + int c = 0; + while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) { + urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash(); + if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } + } + */ + prop.put("info", 3); // crawling queue cleared + prop.put("info_numEntries", c); + } else if (post.containsKey("deleteEntry")) { + String urlHash = (String) post.get("deleteEntry"); + sb.noticeURL.remove(urlHash); + prop.put("LOCATION",""); + return prop; + } + } + + int stackSize = sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE); + if (stackSize == 0) { + prop.put("crawler-queue", 0); + } else { + prop.put("crawler-queue", 1); + plasmaCrawlEntry[] crawlerList = sb.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_REMOTE, showLimit); + + plasmaCrawlEntry urle; + boolean dark = true; + yacySeed initiator; + String profileHandle; + plasmaCrawlProfile.entry profileEntry; + int i, showNum = 0; + for (i = 0; (i < crawlerList.length) && (showNum < showLimit); i++) { + urle = crawlerList[i]; + if (urle != null && urle.url() != null) { + initiator = yacyCore.seedDB.getConnected(urle.initiator()); + profileHandle = urle.profileHandle(); + profileEntry = (profileHandle == null) ? null : sb.profiles.getEntry(profileHandle); + prop.put("crawler-queue_list_" + showNum + "_dark", ((dark) ? 1 : 0) ); + prop.put("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? 
"proxy" : initiator.getName())); + prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); + prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth()); + prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.loaddate()) ); + prop.put("crawler-queue_list_" + showNum + "_anchor", urle.name()); + prop.put("crawler-queue_list_" + showNum + "_url", urle.url().toString()); + prop.put("crawler-queue_list_" + showNum + "_hash", urle.urlhash()); + dark = !dark; + showNum++; + } else { + stackSize--; + } + } + prop.put("crawler-queue_show-num", showNum); //showin sjow-num most recent + prop.put("crawler-queue_num", stackSize);//num Entries + prop.put("crawler-queue_list", showNum); + } + + return prop; + } +} diff --git a/htroot/Status_p.inc b/htroot/Status_p.inc index e93f1b272..435de4f5f 100644 --- a/htroot/Status_p.inc +++ b/htroot/Status_p.inc @@ -71,7 +71,7 @@ #(localCrawlPaused)# ::(paused)#(/localCrawlPaused)# - Remote triggered Crawl + Remote triggered Crawl #[remoteTriggeredCrawlQueueSize]# #(remoteTriggeredCrawlPaused)#pause remote triggered crawl::continue remote triggered crawl#(/remoteTriggeredCrawlPaused)# #(remoteTriggeredCrawlPaused)# ::(paused)#(/remoteTriggeredCrawlPaused)# diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template index 6540d34da..2ab337c64 100644 --- a/htroot/env/templates/submenuIndexCreate.template +++ b/htroot/env/templates/submenuIndexCreate.template @@ -1,9 +1,15 @@