From 9ad1d8dde25326632f01270f57db54b80803fcb3 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Thu, 2 Feb 2012 21:33:42 +0100
Subject: [PATCH] complete redesign of crawl queue monitoring: do not look at
 a ready-prepared crawl list but at the stacks of the domains that are stored
 for balanced crawling.

This also affects the balancer, since it no longer needs to prepare the
pre-selected crawl list for monitoring. As an effect:
- it is no longer possible to see the correct order of the next to-be-crawled
  links, since that depends on the actual state of the balancer stack at the
  time the next URL is requested for loading
- the balancer works better, since the next URL can be selected according to
  the current situation and not according to a pre-selected order.
---
 htroot/Crawler_p.html                         |  61 ++-
 htroot/IndexCreateQueues_p.html               |  95 +++++
 ...lQueue_p.java => IndexCreateQueues_p.java} | 361 ++++++++----------
 htroot/IndexCreateWWWGlobalQueue_p.html       |  58 ---
 htroot/IndexCreateWWWGlobalQueue_p.java       | 125 ------
 htroot/IndexCreateWWWLocalQueue_p.html        |  69 ----
 htroot/IndexCreateWWWRemoteQueue_p.html       |  65 ----
 htroot/IndexCreateWWWRemoteQueue_p.java       | 120 ------
 htroot/api/queues_p.java                      | 124 ------
 htroot/api/queues_p.xml                       |  71 ----
 htroot/api/status_p.java                      |  53 ++-
 htroot/api/status_p.xml                       |  41 +-
 htroot/env/grafics/trash.gif                  | Bin 0 -> 932 bytes
 .../templates/submenuCrawlMonitor.template    |   7 +-
 htroot/js/Crawler.js                          | 155 +++-----
 htroot/yacy/urls.java                         |   2 +-
 source/de/anomic/crawler/Balancer.java        | 247 +++---------
 source/de/anomic/crawler/CrawlQueues.java     |  20 +-
 source/de/anomic/crawler/CrawlStacker.java    |   6 +-
 source/de/anomic/crawler/Latency.java         |   2 +-
 source/de/anomic/crawler/NoticedURL.java      |  56 +--
 source/de/anomic/http/client/Cache.java       |  17 +-
 .../document/parser/html/ContentScraper.java  |   1 +
 source/net/yacy/search/Switchboard.java       |   2 +-
 24 files changed, 528 insertions(+), 1230 deletions(-)
 create mode 100644 htroot/IndexCreateQueues_p.html
 rename htroot/{IndexCreateWWWLocalQueue_p.java => IndexCreateQueues_p.java} (53%)
 delete mode 100644 htroot/IndexCreateWWWGlobalQueue_p.html
 delete mode 100644 htroot/IndexCreateWWWGlobalQueue_p.java
 delete mode 100644 htroot/IndexCreateWWWLocalQueue_p.html
 delete mode 100644 htroot/IndexCreateWWWRemoteQueue_p.html
 delete mode 100644 htroot/IndexCreateWWWRemoteQueue_p.java
 delete mode 100755 htroot/api/queues_p.java
 delete mode 100644 htroot/api/queues_p.xml
 create mode 100644 htroot/env/grafics/trash.gif

diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html
index df78753a1..959ad4dec 100644
--- a/htroot/Crawler_p.html
+++ b/htroot/Crawler_p.html
@@ -6,14 +6,22 @@
 - - + + + +
 #%env/templates/header.template%#
 #%env/templates/submenuCrawlMonitor.template%#
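The redesign described in the commit message means the queue-monitoring pages no longer read a pre-computed list of the next URLs to be crawled; they query the balancer's per-host domain stacks at request time, in this patch via NoticedURL.getDomainStackHosts() (host name mapped to stack size and a guessed waiting delta) and getDomainStackReferences(), both used by the new IndexCreateQueues_p servlet further below. The following minimal, self-contained sketch only illustrates that idea; the class and method names are hypothetical and simplified, not YaCy's implementation:

    import java.util.ArrayDeque;
    import java.util.Deque;
    import java.util.Map;
    import java.util.TreeMap;

    // Sketch only: one URL stack per host; the monitoring view (host -> queued count)
    // is derived on demand instead of being maintained as a pre-selected crawl list.
    public class DomainStackMonitorSketch {

        private final Map<String, Deque<String>> domainStacks = new TreeMap<String, Deque<String>>();

        public void push(final String host, final String url) {
            Deque<String> stack = this.domainStacks.get(host);
            if (stack == null) { stack = new ArrayDeque<String>(); this.domainStacks.put(host, stack); }
            stack.push(url);
        }

        /**
         * monitoring view: host name -> number of queued URLs, computed at request time
         * (the real method in this patch additionally reports a guessed waiting time per host)
         */
        public Map<String, Integer> getDomainStackHosts() {
            final Map<String, Integer> view = new TreeMap<String, Integer>();
            for (final Map.Entry<String, Deque<String>> entry : this.domainStacks.entrySet()) {
                view.put(entry.getKey(), entry.getValue().size());
            }
            return view;
        }

        public static void main(final String[] args) {
            final DomainStackMonitorSketch balancer = new DomainStackMonitorSketch();
            balancer.push("example.org", "http://example.org/a");
            balancer.push("example.org", "http://example.org/b");
            balancer.push("example.net", "http://example.net/");
            // no fixed crawl order is visible any more, only the per-host stack sizes
            System.out.println(balancer.getDomainStackHosts()); // {example.net=1, example.org=2}
        }
    }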

Crawler Queues

-

Next update in seconds. empty - See a access timing here

+ @@ -71,20 +79,6 @@
-
- - - - - - - - - - - -
Speed
PPM
-
@@ -103,15 +97,24 @@
+
+ + + + - + @@ -126,6 +129,7 @@
Indicator Level
Speed + + PPM + +
PPM (Pages Per Minute)          
+

#(info)# @@ -157,23 +161,10 @@ -

Crawl Queue:

- - - - - - - - - - - - - - -
Queue | Profile | Initiator | Depth | Modified Date | Anchor Name | URL | Size | Delete
+

See an access timing

+ + #%env/templates/footer.template%# diff --git a/htroot/IndexCreateQueues_p.html b/htroot/IndexCreateQueues_p.html new file mode 100644 index 000000000..2a803a87b --- /dev/null +++ b/htroot/IndexCreateQueues_p.html @@ -0,0 +1,95 @@ + + + + YaCy '#[clientname]#': '#[queuename]#' Crawl Queue + #%env/templates/metas.template%# + + +
+ #(embed)# + #%env/templates/header.template%# + #%env/templates/submenuCrawlMonitor.template%# +

'#[queuename]#' Crawl Queue

+ ::#(/embed)# + + #(crawler)# +

This crawler queue is empty

+ :: + #(embed)# +
+
+ Delete Entries: + + + + +
+
+ ::#(/embed)# + + + + + + + + + + + + + + + + + + + + + + + + + + #{host}# + + + + + + + #{list}# + + + + + + + + + + + #{/list}# + + + #{/host}# + #(/crawler)# + #(embed)# + #%env/templates/footer.template%# + ::#(/embed)# + + + + \ No newline at end of file diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateQueues_p.java similarity index 53% rename from htroot/IndexCreateWWWLocalQueue_p.java rename to htroot/IndexCreateQueues_p.java index cb109a155..9582de5a4 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateQueues_p.java @@ -1,192 +1,169 @@ -// IndexCreateWWWLocalQueue_p.java -// ------------------------------- -// part of the AnomicHTTPD caching proxy -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004, 2005 -// -//$LastChangedDate$ -//$LastChangedRevision$ -//$LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -// You must compile this file with -// javac -classpath .:../classes IndexCreate_p.java -// if the shell's current path is HTROOT - -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.Iterator; -import java.util.List; -import java.util.Locale; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.kelondro.logging.Log; -import net.yacy.peers.Seed; -import net.yacy.search.Switchboard; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.retrieval.Request; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class IndexCreateWWWLocalQueue_p { - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(final Date date) { - if (date == null) return ""; - return dayFormatter.format(date); - } - - private static final int INVALID = 0; - private static final int URL = 1; - private static final int ANCHOR = 2; - private static final int PROFILE = 3; - private static final int DEPTH = 4; - private static final int INITIATOR = 5; - private static final int MODIFIED = 6; - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - final Switchboard sb = (Switchboard) env; - final serverObjects prop = new serverObjects(); - - int showLimit = 100; - if (post != null) { - showLimit = post.getInt("limit", 100); - - if (post.containsKey("deleteEntries")) { - int c = 0; - - final String pattern = post.get("pattern", ".*").trim(); - final int option = post.getInt("option", INVALID); - if (".*".equals(pattern)) { 
- c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); - sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.CORE); - try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */} - } else if (option > INVALID) { - try { - // compiling the regular expression - final Pattern compiledPattern = Pattern.compile(pattern); - - if (option == PROFILE) { - // search and delete the crawl profile (_much_ faster, independant of queue size) - // XXX: what to do about the annoying LOST PROFILE messages in the log? - CrawlProfile entry; - for (final byte[] handle: sb.crawler.getActive()) { - entry = sb.crawler.getActive(handle); - final String name = entry.name(); - if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) - continue; - if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes()); - } - } else { - // iterating through the list of URLs - final Iterator iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE); - Request entry; - final List removehashes = new ArrayList(); - while (iter.hasNext()) { - if ((entry = iter.next()) == null) continue; - String value = null; - - location: switch (option) { - case URL: value = (entry.url() == null) ? null : entry.url().toString(); break location; - case ANCHOR: value = entry.name(); break location; - case DEPTH: value = Integer.toString(entry.depth()); break location; - case INITIATOR: - value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator()); - break location; - case MODIFIED: value = daydate(entry.appdate()); break location; - default: value = null; break location; - } - - if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash()); - } - Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'"); - for (final byte[] b: removehashes) { - sb.crawlQueues.noticeURL.removeByURLHash(b); - } - } - } catch (final PatternSyntaxException e) { - Log.logException(e); - } - } - - prop.put("info", "3");//crawling queue cleared - prop.putNum("info_numEntries", c); - } else if (post.containsKey("deleteEntry")) { - final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); - prop.put("LOCATION",""); - return prop; - } - } - - int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); - if (stackSize == 0) { - prop.put("crawler-queue", "0"); - } else { - prop.put("crawler-queue", "1"); - final List crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20)); - - Request urle; - boolean dark = true; - Seed initiator; - String profileHandle; - CrawlProfile profileEntry; - int i; - for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { - urle = crawlerList.get(i); - if ((urle != null)&&(urle.url()!=null)) { - initiator = sb.peers.getConnected(urle.initiator() == null ? "" : ASCII.String(urle.initiator())); - profileHandle = urle.profileHandle(); - profileEntry = profileHandle == null ? 
null : sb.crawler.getActive(profileHandle.getBytes()); - prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0"); - prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); - prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); - prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth()); - prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) ); - prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name()); - prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true)); - prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash()); - dark = !dark; - showNum++; - } else { - stackSize--; - } - } - prop.putNum("crawler-queue_list", showNum); - prop.putNum("crawler-queue_num", stackSize);//num Entries - prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent - - } - - // return rewrite properties - return prop; - } -} + +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import net.yacy.cora.document.ASCII; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.logging.Log; +import net.yacy.peers.Seed; +import net.yacy.search.Switchboard; +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.CrawlSwitchboard; +import de.anomic.crawler.NoticedURL.StackType; +import de.anomic.crawler.retrieval.Request; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; + +public class IndexCreateQueues_p { + + private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); + private static String daydate(final Date date) { + if (date == null) return ""; + return dayFormatter.format(date); + } + + private static final int INVALID = 0; + private static final int URL = 1; + private static final int ANCHOR = 2; + private static final int PROFILE = 3; + private static final int DEPTH = 4; + private static final int INITIATOR = 5; + private static final int MODIFIED = 6; + + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + // return variable that accumulates replacements + final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + StackType stackType = StackType.LOCAL; + int urlsPerHost = 5; + boolean embed = false; + String deletepattern = ".*"; + + if (post != null) { + stackType = StackType.valueOf(post.get("stack", stackType.name()).toUpperCase()); + urlsPerHost = post.getInt("urlsPerHost", urlsPerHost); + if (post.containsKey("embed")) embed = true; + + if (post.containsKey("delete")) { + deletepattern = post.get("pattern", deletepattern).trim(); + final int option = post.getInt("option", INVALID); + if (".*".equals(deletepattern)) { + sb.crawlQueues.noticeURL.clear(stackType); + try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */} + } else if (option > INVALID) { + try { + // compiling the regular expression + final Pattern compiledPattern = Pattern.compile(deletepattern); + + if (option == PROFILE) { + // search and delete the crawl profile (_much_ faster, independant of queue size) + // XXX: what to do about the annoying LOST PROFILE messages in the log? 
+ CrawlProfile entry; + for (final byte[] handle: sb.crawler.getActive()) { + entry = sb.crawler.getActive(handle); + final String name = entry.name(); + if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) + continue; + if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes()); + } + } else { + // iterating through the list of URLs + final Iterator iter = sb.crawlQueues.noticeURL.iterator(stackType); + Request entry; + final List removehashes = new ArrayList(); + while (iter.hasNext()) { + if ((entry = iter.next()) == null) continue; + String value = null; + + location: switch (option) { + case URL: value = (entry.url() == null) ? null : entry.url().toString(); break location; + case ANCHOR: value = entry.name(); break location; + case DEPTH: value = Integer.toString(entry.depth()); break location; + case INITIATOR: + value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator()); + break location; + case MODIFIED: value = daydate(entry.appdate()); break location; + default: value = null; break location; + } + + if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash()); + } + Log.logInfo("IndexCreateQueues_p", "created a remove list with " + removehashes.size() + " entries for pattern '" + deletepattern + "'"); + for (final byte[] b: removehashes) { + sb.crawlQueues.noticeURL.removeByURLHash(b); + } + } + } catch (final PatternSyntaxException e) { + Log.logException(e); + } + } + } + } + + int stackSize = sb.crawlQueues.noticeURL.stackSize(stackType); + if (stackSize == 0) { + prop.put("crawler", "0"); + } else { + prop.put("crawler", "1"); + prop.put("crawler_embed", embed ? 1 : 0); + prop.put("crawler_embed_deletepattern", deletepattern); + prop.put("crawler_embed_queuename", stackType.name()); + + final Map hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType); + + int hc = 0; + for (Map.Entry host: hosts.entrySet()) { + prop.putHTML("crawler_host_" + hc + "_hostname", host.getKey()); + prop.put("crawler_host_" + hc + "_embed", embed ? 1 : 0); + prop.put("crawler_host_" + hc + "_urlsPerHost", urlsPerHost); + prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name()); + prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]); + prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1]); + List domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost); + + Seed initiator; + String profileHandle; + CrawlProfile profileEntry; + int count = 0; + for (Request request: domainStackReferences) { + if (request == null) continue; + initiator = sb.peers.getConnected(request.initiator() == null ? "" : ASCII.String(request.initiator())); + profileHandle = request.profileHandle(); + profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes()); + prop.putHTML("crawler_host_" + hc + "_list_" + count + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); + prop.put("crawler_host_" + hc + "_list_" + count + "_profile", ((profileEntry == null) ? 
"unknown" : profileEntry.name())); + prop.put("crawler_host_" + hc + "_list_" + count + "_depth", request.depth()); + prop.put("crawler_host_" + hc + "_list_" + count + "_modified", daydate(request.appdate()) ); + prop.putHTML("crawler_host_" + hc + "_list_" + count + "_anchor", request.name()); + prop.put("crawler_host_" + hc + "_list_" + count + "_delta", sb.crawlQueues.noticeURL.getDomainSleepTime(stackType, sb.crawler, request)); + prop.putHTML("crawler_host_" + hc + "_list_" + count + "_url", request.url().toNormalform(false, true)); + prop.put("crawler_host_" + hc + "_list_" + count + "_hash", request.url().hash()); + count++; + } + prop.putNum("crawler_host_" + hc + "_list", count); + hc++; + } + prop.put("crawler_host", hc); + } + + prop.put("embed", embed ? 1 : 0); + prop.put("queuename", stackType.name().charAt(0) + stackType.name().substring(1).toLowerCase()); + prop.put("embed_queuename", stackType.name().charAt(0) + stackType.name().substring(1).toLowerCase()); + + // return rewrite properties + return prop; + } +} diff --git a/htroot/IndexCreateWWWGlobalQueue_p.html b/htroot/IndexCreateWWWGlobalQueue_p.html deleted file mode 100644 index 6151b7c0d..000000000 --- a/htroot/IndexCreateWWWGlobalQueue_p.html +++ /dev/null @@ -1,58 +0,0 @@ - - - - YaCy '#[clientname]#': Global Crawl Queue - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuCrawlMonitor.template%# -

Global Crawl Queue

-

- This queue stores the urls that shall be sent to other peers to perform a remote crawl. - If there is no peer for remote crawling available, the links are crawled locally. -

- #(crawler-queue)# -

The global crawler queue is empty

- :: -
-
- -
- -

There are #[num]# entries in the global crawler queue. Showing #[show-num]# most recent entries.

-

Show last 50 | 100 | 250 | 500 entries.

-
Count | Delta/ms | Host | Initiator | Profile | Depth | Modified Date | Anchor Name | Delta/ms | URL
#[hostcount]##[hostdelta]# #[hostname]#
#[initiator]##[profile]##[depth]##[modified]##[anchor]##[delta]##[url]#
- - - - - - - - - - - - - - - - - - #{list}# - - - - - - - - - - #{/list}# -
Initiator | Profile | Depth | Modified Date | Anchor Name | URL | Delete
#[initiator]##[profile]##[depth]##[modified]##[anchor]##[url]#[Delete]
- #(/crawler-queue)# - #%env/templates/footer.template%# - - diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java deleted file mode 100644 index efa3222e3..000000000 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ /dev/null @@ -1,125 +0,0 @@ -// IndexCreateWWWGlobalQueue_p.java -// ------------------------------- -// part of the AnomicHTTPD caching proxy -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004, 2005 -// -//$LastChangedDate$ -//$LastChangedRevision$ -//$LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -// You must compile this file with -// javac -classpath .:../classes IndexCreate_p.java -// if the shell's current path is HTROOT - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.List; -import java.util.Locale; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.peers.Seed; -import net.yacy.search.Switchboard; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.retrieval.Request; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class IndexCreateWWWGlobalQueue_p { - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(final Date date) { - if (date == null) return ""; - return dayFormatter.format(date); - } - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - final Switchboard sb = (Switchboard) env; - final serverObjects prop = new serverObjects(); - - int showLimit = 100; - if (post != null) { - showLimit = post.getInt("limit", 100); - - if (post.containsKey("clearcrawlqueue")) { - final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); - sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.LIMIT); - try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */} - /* - int c = 0; - while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) { - urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash(); - if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } - } - */ - prop.put("info", "3");//crawling queue cleared - prop.putNum("info_numEntries", c); - } else if (post.containsKey("deleteEntry")) { - final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); - prop.put("LOCATION",""); - return prop; - } - } - - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); - if (stackSize == 0) { - prop.put("crawler-queue", "0"); - } else { - 
prop.put("crawler-queue", "1"); - final List crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit); - - Request urle; - boolean dark = true; - Seed initiator; - String profileHandle; - CrawlProfile profileEntry; - int i, showNum = 0; - for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { - urle = crawlerList.get(i); - if (urle != null && urle.url() != null) { - initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator())); - profileHandle = urle.profileHandle(); - profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes()); - prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0"); - prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); - prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); - prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth()); - prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) ); - prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name()); - prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true)); - prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash()); - dark = !dark; - showNum++; - } else { - stackSize--; - } - } - prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent - prop.putNum("crawler-queue_num", stackSize);//num Entries - prop.putNum("crawler-queue_list", showNum); - } - - // return rewrite properties - return prop; - } -} diff --git a/htroot/IndexCreateWWWLocalQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html deleted file mode 100644 index 8c7bfeb01..000000000 --- a/htroot/IndexCreateWWWLocalQueue_p.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - YaCy '#[clientname]#': Local Crawl Queue - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuCrawlMonitor.template%# -

Local Crawl Queue

-

- This queue stores the urls that shall be crawled localy by this peer. - It may also contain urls that are computed by the proxy-prefetch. -

- - #(crawler-queue)# -

The local crawler queue is empty

- :: -
-
- Delete Entries: - - - This may take a quite long time. -
-
-

There are #[num]# entries in the local crawler queue. Showing #[show-num]# most recent entries.

-

Show last 50 | 100 | 250 | 500 entries.

- - - - - - - - - - - - - - - - - - - #{list}# - - - - - - - - - - #{/list}# -
Initiator | Profile | Depth | Modified Date | Anchor Name | URL | Delete
#[initiator]##[profile]##[depth]##[modified]##[anchor]##[url]#[Delete]
- #(/crawler-queue)# - #%env/templates/footer.template%# - - \ No newline at end of file diff --git a/htroot/IndexCreateWWWRemoteQueue_p.html b/htroot/IndexCreateWWWRemoteQueue_p.html deleted file mode 100644 index abb6cc543..000000000 --- a/htroot/IndexCreateWWWRemoteQueue_p.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - YaCy '#[clientname]#': Remote Crawl Queue - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuCrawlMonitor.template%# -

Remote Crawl Queue

-

- This queue stores the urls that other peers sent to you in order to perform a remote crawl for them. -

- #(crawler-queue)# -

The remote crawler queue is empty

- :: -
-
- -
-
-

- There are #[num]# entries in the remote crawler queue. - Showing #[show-num]# most recent entries. -

-

- Show last 50 | - 100 | - 250 | - 500 entries. -

- - - - - - - - - - - - - - - - - - - #{list}# - - - - - - - - - - #{/list}# -
Initiator | Profile | Depth | Modified Date | Anchor Name | URL | Delete
#[initiator]##[profile]##[depth]##[modified]##[anchor]##[url]#[Delete]
- #(/crawler-queue)# - #%env/templates/footer.template%# - - diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java deleted file mode 100644 index a82ee5221..000000000 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ /dev/null @@ -1,120 +0,0 @@ -// IndexCreateWWWRemoteQueue_p.java -// ------------------------------- -// part of the AnomicHTTPD caching proxy -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004, 2005 -// last major change: 04.07.2005 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -// You must compile this file with -// javac -classpath .:../classes IndexCreateWWWRemoteQueue_p.java -// if the shell's current path is HTROOT - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.List; -import java.util.Locale; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.peers.Seed; -import net.yacy.search.Switchboard; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.retrieval.Request; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; -import de.anomic.server.servletProperties; - -public class IndexCreateWWWRemoteQueue_p { - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(final Date date) { - if (date == null) return ""; - return dayFormatter.format(date); - } - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - final servletProperties prop = new servletProperties(); - final Switchboard sb = (Switchboard)env; - - int showLimit = 100; - if (post != null) { - showLimit = post.getInt("limit", 100); - - if (post.containsKey("clearcrawlqueue")) { - final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE); - sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.REMOTE); - try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */} - /* - int c = 0; - while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) { - urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash(); - if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } - } - */ - prop.put("info", "3"); // crawling queue cleared - prop.putNum("info_numEntries", c); - } else if (post.containsKey("deleteEntry")) { - final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); - prop.put("LOCATION",""); - return prop; - } - } - - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE); - if (stackSize == 0) { - prop.put("crawler-queue", "0"); - } else { - 
prop.put("crawler-queue", "1"); - final List crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit); - - Request urle; - boolean dark = true; - Seed initiator; - String profileHandle; - CrawlProfile profileEntry; - int i, showNum = 0; - for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { - urle = crawlerList.get(i); - if (urle != null && urle.url() != null) { - initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator())); - profileHandle = urle.profileHandle(); - profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes()); - prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0"); - prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName())); - prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); - prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth()); - prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) ); - prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name()); - prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString()); - prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash()); - dark = !dark; - showNum++; - } else { - stackSize--; - } - } - prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent - prop.putNum("crawler-queue_num", stackSize);//num Entries - prop.putNum("crawler-queue_list", showNum); - } - return prop; - } -} diff --git a/htroot/api/queues_p.java b/htroot/api/queues_p.java deleted file mode 100755 index 2c9b81bb4..000000000 --- a/htroot/api/queues_p.java +++ /dev/null @@ -1,124 +0,0 @@ -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.List; -import java.util.Locale; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.peers.Seed; -import net.yacy.search.Switchboard; -import net.yacy.search.SwitchboardConstants; -import net.yacy.search.index.Segment; -import net.yacy.search.index.Segments; -import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.retrieval.Request; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class queues_p { - - public static final String STATE_RUNNING = "running"; - public static final String STATE_PAUSED = "paused"; - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(final Date date) { - if (date == null) return ""; - return dayFormatter.format(date); - } - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - final Switchboard sb = (Switchboard) env; - //wikiCode wikiTransformer = new wikiCode(switchboard); - final serverObjects prop = new serverObjects(); - Segment segment = null; - final boolean html = post != null && post.containsKey("html"); - prop.setLocalized(html); - if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header)) { - segment = sb.indexSegments.segment(post.get("segment")); - } - if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC); - prop.put("rejected", "0"); - //int showRejectedCount = 10; - - Seed initiator; - - // index size - prop.putNum("urlpublictextSize", 
segment.urlMetadata().size()); - prop.putNum("rwipublictextSize", segment.termIndex().sizesMax()); - - // loader queue - prop.putNum("loaderSize", sb.crawlQueues.workerSize()); - prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)); - if (sb.crawlQueues.workerSize() == 0) { - prop.put("list-loader", "0"); - } else { - final Request[] w = sb.crawlQueues.activeWorkerEntries(); - int count = 0; - for (final Request r : w) { - if (r == null) continue; - prop.put("list-loader_"+count+"_profile", r.profileHandle()); - initiator = sb.peers.getConnected((r.initiator() == null) ? "" : ASCII.String(r.initiator())); - prop.putHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); - prop.put("list-loader_"+count+"_depth", r.depth()); - prop.putXML("list-loader_"+count+"_url", r.url().toString()); - count++; - } - prop.put("list-loader", count); - } - - //local crawl queue - prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount()); - prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING); - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); - addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, Math.min(10, stackSize))); - - //global crawl queue - prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize()); - prop.put("limitCrawlState", STATE_RUNNING); - stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); - - //remote crawl queue - prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount()); - prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING); - stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); - - if (stackSize == 0) { - prop.put("list-remote", "0"); - } else { - addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, Math.min(10, stackSize))); - } - - //noload crawl queue - prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize()); - prop.put("noloadCrawlState", STATE_RUNNING); - //stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.NOLOAD); - - - // return rewrite properties - return prop; - } - - - public static final void addNTable(final Switchboard sb, final serverObjects prop, final String tableName, final List crawlerList) { - - int showNum = 0; - Seed initiator; - for (final Request urle : crawlerList) { - if ((urle != null) && (urle.url() != null)) { - initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : UTF8.String(urle.initiator())); - prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle()); - prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? 
"proxy" : initiator.getName())); - prop.put(tableName + "_" + showNum + "_depth", urle.depth()); - prop.put(tableName + "_" + showNum + "_modified", daydate(urle.appdate())); - prop.putXML(tableName + "_" + showNum + "_anchor", urle.name()); - prop.putXML(tableName + "_" + showNum + "_url", urle.url().toNormalform(false, true)); - prop.put(tableName + "_" + showNum + "_hash", urle.url().hash()); - showNum++; - } - } - prop.put(tableName, showNum); - - } -} diff --git a/htroot/api/queues_p.xml b/htroot/api/queues_p.xml deleted file mode 100644 index d8d3d8abb..000000000 --- a/htroot/api/queues_p.xml +++ /dev/null @@ -1,71 +0,0 @@ - - - - #[urlpublictextSize]# - #[rwipublictextSize]# - - - #[loaderSize]# - #[loaderMax]# -#{list-loader}# - - #[profile]# - #[initiator]# - #[depth]# - #[url]# - -#{/list-loader}# - - - #[localCrawlSize]# - #[localCrawlState]# -#{list-local}# - - #[profile]# - #[initiator]# - #[depth]# - #[modified]# - #[anchor]# - #[url]# - #[hash]# - #(inProcess)#false::true#(/inProcess)# - -#{/list-local}# - - - #[limitCrawlSize]# - #[limitCrawlState]# -#{list-limit}# - - #[profile]# - #[initiator]# - #[depth]# - #[modified]# - #[anchor]# - #[url]# - #[hash]# - #(inProcess)#false::true#(/inProcess)# - -#{/list-limit}# - - - #[remoteCrawlSize]# - #[remoteCrawlState]# -#{list-remote}# - - #[profile]# - #[initiator]# - #[depth]# - #[modified]# - #[anchor]# - #[url]# - #[hash]# - #(inProcess)#false::true#(/inProcess)# - -#{/list-remote}# - - - #[noloadCrawlSize]# - #[noloadCrawlState]# - - diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index ce26f1b65..83ba95e41 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -1,4 +1,29 @@ - +// status_p +// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 18.12.2006 on http://www.anomic.de +// this file was created using the an implementation from IndexCreate_p.java, published 02.12.2004 +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.io.ByteCount; @@ -13,6 +38,8 @@ import de.anomic.server.serverSwitch; public class status_p { + public static final String STATE_RUNNING = "running"; + public static final String STATE_PAUSED = "paused"; public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements @@ -53,6 +80,30 @@ public class status_p { prop.put("trafficProxy", ByteCount.getAccountCount(ByteCount.PROXY)); prop.put("trafficCrawler", ByteCount.getAccountCount(ByteCount.CRAWLER)); + // index size + prop.putNum("urlpublictextSize", segment.urlMetadata().size()); + prop.putNum("rwipublictextSize", segment.termIndex().sizesMax()); + + // loader queue + prop.putNum("loaderSize", sb.crawlQueues.workerSize()); + prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)); + + //local crawl queue + prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount()); + prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING); + + //global crawl queue + prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize()); + prop.put("limitCrawlState", STATE_RUNNING); + + //remote crawl queue + prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount()); + prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING); + + //noload crawl queue + prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize()); + prop.put("noloadCrawlState", STATE_RUNNING); + // return rewrite properties return prop; } diff --git a/htroot/api/status_p.xml b/htroot/api/status_p.xml index a8f4ae4eb..25c62234b 100644 --- a/htroot/api/status_p.xml +++ b/htroot/api/status_p.xml @@ -1,35 +1,52 @@ #[ppm]# + #[wordCacheSize]# #[wordCacheMaxSize]# + + + #[freeMemory]# + #[totalMemory]# + #[maxMemory]# + + + #[processors]# + + + #[trafficIn]# + #[trafficProxy]# + #[trafficCrawler]# + + + + #[urlpublictextSize]# + #[rwipublictextSize]# + #[loaderSize]# #[loaderMax]# + #[localCrawlSize]# + #[localCrawlState]# + #[limitCrawlSize]# + #[limitCrawlState]# + #[remoteCrawlSize]# + #[remoteCrawlState]# + #[noloadCrawlSize]# + #[noloadCrawlState]# - - - #[freeMemory]# - #[totalMemory]# - #[maxMemory]# - - #[processors]# - - #[trafficIn]# - #[trafficProxy]# - #[trafficCrawler]# - + diff --git a/htroot/env/grafics/trash.gif b/htroot/env/grafics/trash.gif new file mode 100644 index 0000000000000000000000000000000000000000..3ff757f770d3e7e664aa13ed140e7490ae6fc2d5 GIT binary patch literal 932 zcmZ?wbhEHb6krfw_|9PX|Ns9VB!U|xf(xX86C{EIB!Uejf)yl!1*Cu(B!USff({Zv z11X?_L{LB?$RH6UkO*#mT+YD2p!k!8k%8eKgAR}l%n=LNe>r7ttb1^vnSa`SwdpPd#TdzUx=<;uFVqVR!qxPo(6$=9?8{1fH{t@^_9>F5Fdv$yuXxR#r? 
zS^VuT+sd13q&xD?*0G)48LZ$l-=Vch;goRZg(W_{-$J-T(}Pzzuk|WjWwL7j;)v;4 Ze{Zdwt37L7(X!m|5QXF0nVFdwtO3P_CEox5 literal 0 HcmV?d00001 diff --git a/htroot/env/templates/submenuCrawlMonitor.template b/htroot/env/templates/submenuCrawlMonitor.template index 542d1d6c7..992c36bd5 100644 --- a/htroot/env/templates/submenuCrawlMonitor.template +++ b/htroot/env/templates/submenuCrawlMonitor.template @@ -14,9 +14,10 @@ diff --git a/htroot/js/Crawler.js b/htroot/js/Crawler.js index ef0e861f5..9747dba41 100644 --- a/htroot/js/Crawler.js +++ b/htroot/js/Crawler.js @@ -5,12 +5,10 @@ WORDCACHEBAR_LENGTH=1/4; var statusRPC; -var queuesRPC; -var refreshInterval=5; +var refreshInterval=3; var wait=0; var changing=false; //change the interval var statusLoaded=true; -var queueLoaded=true; function initCrawler(){ refresh(); @@ -38,21 +36,20 @@ function newInterval(){ countInterval=window.setInterval("countdown()", 1000); changing=false; } + function countdown(){ - if(statusLoaded && queueLoaded){ - document.getElementById("nextUpdate").value=wait; - wait--; - if(wait==0){ + if(statusLoaded){ + wait--; + if (wait == 0) { refresh(); } } } + function refresh(){ wait=refreshInterval; statusLoaded=false; - queueLoaded=false; requestStatus(); - requestQueues(); } function requestStatus(){ @@ -61,13 +58,6 @@ function requestStatus(){ statusRPC.onreadystatechange = handleStatus; statusRPC.send(null); } -function requestQueues(){ - queuesRPC=createRequestObject(); - queuesRPC.open('get', '/api/queues_p.xml?html='); - queuesRPC.onreadystatechange = handleQueues; - queuesRPC.send(null); - -} function handleStatus(){ if(statusRPC.readyState != 4){ @@ -118,65 +108,44 @@ function handleStatus(){ img.setAttribute("src", BAR_IMG1); wordCacheSpan.appendChild(img); } - statusLoaded=true; -} - -function handleQueues(){ - if(queuesRPC.readyState != 4){ - return; - } - var queuesResponse = queuesRPC.responseXML; - //xml=getFirstChild(queuesResponse); - xml=getFirstChild(queuesResponse, "queues"); - if(queuesResponse != null){ - clearTable(document.getElementById("queueTable"), 1); - dbsize=getFirstChild(xml, "dbsize"); - urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext")); - rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext")); - document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize; - document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize; - - loaderqueue=getFirstChild(xml, "loaderqueue"); - updateTable(loaderqueue, "loader"); - - loaderqueue_size=getValue(getFirstChild(loaderqueue, "size")); - loaderqueue_max=getValue(getFirstChild(loaderqueue, "max")); - document.getElementById("loaderqueuesize").firstChild.nodeValue=loaderqueue_size; - document.getElementById("loaderqueuemax").firstChild.nodeValue=loaderqueue_max; - - localcrawlerqueue=getFirstChild(xml, "localcrawlerqueue"); - localcrawlerqueue_size=getValue(getFirstChild(localcrawlerqueue, "size")); - localcrawlerqueue_state=getValue(getFirstChild(localcrawlerqueue, "state")); - document.getElementById("localcrawlerqueuesize").firstChild.nodeValue=localcrawlerqueue_size; - putQueueState("localcrawler", localcrawlerqueue_state); - - updateTable(localcrawlerqueue, "local crawler"); - - limitcrawlerqueue=getFirstChild(xml, "limitcrawlerqueue"); - updateTable(limitcrawlerqueue, "limitCrawlerTable"); - limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size")); - limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state")); - 
document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size; - putQueueState("limitcrawler", limitcrawlerqueue_state); - updateTable(limitcrawlerqueue, "limit crawler"); - - remotecrawlerqueue=getFirstChild(xml, "remotecrawlerqueue"); - updateTable(remotecrawlerqueue, "remoteCrawlerTable"); - remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size")); - remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state")); - document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size; - putQueueState("remotecrawler", remotecrawlerqueue_state); - updateTable(remotecrawlerqueue, "remote crawler"); - - noloadcrawlerqueue=getFirstChild(xml, "noloadcrawlerqueue"); - noloadcrawlerqueue_size=getValue(getFirstChild(noloadcrawlerqueue, "size")); - noloadcrawlerqueue_state=getValue(getFirstChild(noloadcrawlerqueue, "state")); - document.getElementById("noloadcrawlerqueuesize").firstChild.nodeValue=noloadcrawlerqueue_size; - putQueueState("noloadcrawler", noloadcrawlerqueue_state); + dbsize=getFirstChild(statusTag, "dbsize"); + urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext")); + rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext")); + document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize; + document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize; + + loaderqueue=getFirstChild(statusTag, "loaderqueue"); + loaderqueue_size=getValue(getFirstChild(loaderqueue, "size")); + loaderqueue_max=getValue(getFirstChild(loaderqueue, "max")); + document.getElementById("loaderqueuesize").firstChild.nodeValue=loaderqueue_size; + document.getElementById("loaderqueuemax").firstChild.nodeValue=loaderqueue_max; + + localcrawlerqueue=getFirstChild(statusTag, "localcrawlerqueue"); + localcrawlerqueue_size=getValue(getFirstChild(localcrawlerqueue, "size")); + localcrawlerqueue_state=getValue(getFirstChild(localcrawlerqueue, "state")); + document.getElementById("localcrawlerqueuesize").firstChild.nodeValue=localcrawlerqueue_size; + putQueueState("localcrawler", localcrawlerqueue_state); + + limitcrawlerqueue=getFirstChild(statusTag, "limitcrawlerqueue"); + limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size")); + limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state")); + document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size; + putQueueState("limitcrawler", limitcrawlerqueue_state); + + remotecrawlerqueue=getFirstChild(statusTag, "remotecrawlerqueue"); + remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size")); + remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state")); + document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size; + putQueueState("remotecrawler", remotecrawlerqueue_state); + + noloadcrawlerqueue=getFirstChild(statusTag, "noloadcrawlerqueue"); + noloadcrawlerqueue_size=getValue(getFirstChild(noloadcrawlerqueue, "size")); + noloadcrawlerqueue_state=getValue(getFirstChild(noloadcrawlerqueue, "state")); + document.getElementById("noloadcrawlerqueuesize").firstChild.nodeValue=noloadcrawlerqueue_size; + putQueueState("noloadcrawler", noloadcrawlerqueue_state); - } - queueLoaded=true; + statusLoaded=true; } function putQueueState(queue, state) { @@ -184,53 +153,17 @@ function putQueueState(queue, state) { img = document.getElementById(queue + "stateIMG"); if (state == "paused") { a.href = 
"Crawler_p.html?continue=" + queue; - a.title = "Continue this queue"; + a.title = "Continue this queue (" + state + ")"; img.src = "/env/grafics/start.gif"; img.alt = "Continue this queue"; } else { a.href = "Crawler_p.html?pause=" + queue; - a.title = "Pause this queue"; + a.title = "Pause this queue (" + state + ")"; img.src = "/env/grafics/stop.gif"; img.alt = "Pause this queue"; } } -function updateTable(indexingqueue, tablename){ - indexingTable=document.getElementById("queueTable"); - entries=indexingqueue.getElementsByTagName("entry"); - - dark=false; - for(i=0;i 80) { diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index e01f091fb..20116c629 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -60,7 +60,7 @@ public class urls { if (post.get("call", "").equals("remotecrawl")) { // perform a remote crawl url handover - final NoticedURL.StackType stackType = NoticedURL.StackType.LIMIT; + final NoticedURL.StackType stackType = NoticedURL.StackType.GLOBAL; int maxCount = Math.min(100, post.getInt("count", 10)); final long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000))); final long timeout = System.currentTimeMillis() + maxTime; diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index a9d3438ca..b6bf3af44 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -29,22 +29,20 @@ package de.anomic.crawler; import java.io.File; import java.io.IOException; import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentMap; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.services.federated.yacy.CacheStrategy; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.BufferedObjectIndex; import net.yacy.kelondro.index.HandleSet; @@ -53,7 +51,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.table.Table; -import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.MemoryControl; import de.anomic.crawler.retrieval.Request; import de.anomic.http.client.Cache; @@ -74,9 +71,6 @@ public class Balancer { // class variables computed during operation private final ConcurrentMap domainStacks; // a map from host name to lists with url hashs - private final ConcurrentLinkedQueue top; // a list of url-hashes that shall be taken next - private final SortedMap delayed; - private final HandleSet ddc; private final HandleSet double_push_check; // for debugging private long lastDomainStackFill; private int domStackInitSize; @@ -91,13 +85,10 @@ public class Balancer { final boolean exceed134217727) { this.cacheStacksPath = cachePath; this.domainStacks = new ConcurrentHashMap(); - this.top = new ConcurrentLinkedQueue(); - this.delayed = new TreeMap(); this.minimumLocalDelta = minimumLocalDelta; this.minimumGlobalDelta = minimumGlobalDelta; this.myAgentIDs = myAgentIDs; this.domStackInitSize = Integer.MAX_VALUE; - this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, 
URIMetadataRow.rowdef.objectOrder, 0); this.double_push_check = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // create a stack for newly entered entries @@ -145,12 +136,7 @@ public class Balancer { Log.logException(e); } this.domainStacks.clear(); - this.top.clear(); - synchronized (this.delayed) { - this.delayed.clear(); - } this.double_push_check.clear(); - this.ddc.clear(); } public Request get(final byte[] urlhash) throws IOException { @@ -202,28 +188,11 @@ public class Balancer { if (entry != null) removedCounter++; // remove from double-check caches - this.ddc.remove(urlhash); this.double_push_check.remove(urlhash); } if (removedCounter == 0) return 0; assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s; - // iterate through the top list - final Iterator j = this.top.iterator(); - byte[] urlhash; - while (j.hasNext()) { - urlhash = j.next(); - if (urlHashes.has(urlhash)) j.remove(); - } - - // remove from delayed - synchronized (this.delayed) { - final Iterator> k = this.delayed.entrySet().iterator(); - while (k.hasNext()) { - if (urlHashes.has(k.next().getValue())) k.remove(); - } - } - // iterate through the domain stacks final Iterator> q = this.domainStacks.entrySet().iterator(); HandleSet stack; @@ -237,7 +206,7 @@ public class Balancer { } public boolean has(final byte[] urlhashb) { - return this.urlFileIndex.has(urlhashb) || this.ddc.has(urlhashb); + return this.urlFileIndex.has(urlhashb) || this.double_push_check.has(urlhashb); } public boolean notEmpty() { @@ -277,7 +246,6 @@ public class Balancer { synchronized (this) { // double-check if (this.double_push_check.has(hash)) return "double occurrence in double_push_check"; - if (this.ddc.has(hash)) return "double occurrence in ddc"; if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex"; if (this.double_push_check.size() > 10000 || MemoryControl.shortStatus()) this.double_push_check.clear(); @@ -297,16 +265,16 @@ public class Balancer { /** * get a list of domains that are currently maintained as domain stacks - * @return a map of clear text strings of host names to the size of the domain stack + * @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time} */ - public Map getDomainStackHosts() { - Map map = new HashMap(); + public Map getDomainStackHosts() { + Map map = new TreeMap(); // we use a tree map to get a stable ordering for (Map.Entry entry: this.domainStacks.entrySet()) { - map.put(entry.getKey(), entry.getValue().size()); + map.put(entry.getKey(), new Integer[]{entry.getValue().size(), (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)}); } return map; } - + /** * compute the current sleep time for a given crawl entry * @param cs @@ -315,20 +283,20 @@ public class Balancer { */ public long getDomainSleepTime(final CrawlSwitchboard cs, Request crawlEntry) { final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle())); - return getDomainSleepTime(cs, profileEntry, crawlEntry); + return getDomainSleepTime(cs, profileEntry, crawlEntry.url()); } - - private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, Request crawlEntry) { + + private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, final DigestURI crawlURL) { if (profileEntry == null) { return 0; } long 
         long sleeptime = (
             profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
-            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
-            ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
+            ) ? 0 : Latency.waitingRemaining(crawlURL, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
         return sleeptime;
     }
-
+
     /**
      * get lists of crawl request entries for a specific host
      * @param host
     }
         return cel;
     }
-
+
     private void pushHashToDomainStacks(String host, final byte[] urlhash) throws RowSpaceExceededException {
         // extend domain stack
         if (host == null) host = localhost;
         if (domainList.isEmpty()) this.domainStacks.remove(host);
     }
-    private byte[] nextFromDelayed() {
-        if (this.delayed.isEmpty()) return null;
-        final Long first = this.delayed.firstKey();
-        if (first.longValue() < System.currentTimeMillis()) {
-            return this.delayed.remove(first);
-        }
-        return null;
-    }
-
-    private byte[] anyFromDelayed() {
-        if (this.delayed.isEmpty()) return null;
-        final Long first = this.delayed.firstKey();
-        return this.delayed.remove(first);
-    }
-
     /**
      * get the next entry in this crawl queue in such a way that the domain access time delta is maximized
      * and always above the given minimum delay time. An additional delay time is computed using the robots.txt
     public Request pop(final boolean delay, final CrawlSwitchboard cs) throws IOException {
         // returns a crawl entry from the stack and ensures minimum delta times
-        try {
-            filltop(delay, -600000, false);
-            filltop(delay, -60000, false);
-            filltop(delay, -10000, false);
-            filltop(delay, -6000, false);
-            filltop(delay, -4000, false);
-            filltop(delay, -3000, false);
-            filltop(delay, -2000, false);
-            filltop(delay, -1000, false);
-            filltop(delay, -500, false);
-            filltop(delay, 0, true);
-            filltop(delay, 500, true);
-            filltop(delay, 1000, true);
-            filltop(delay, 2000, true);
-            filltop(delay, 3000, true);
-            filltop(delay, 4000, true);
-            filltop(delay, 6000, true);
-            filltop(delay, Long.MAX_VALUE, true);
-        } catch (final RowSpaceExceededException e) {}
-
         long sleeptime = 0;
         Request crawlEntry = null;
         synchronized (this) {
             byte[] failhash = null;
             while (!this.urlFileIndex.isEmpty()) {
-                // first simply take one of the entries in the top list, that should be one without any delay
-                byte[] nexthash = nextFromDelayed();
-                //System.out.println("*** nextFromDelayed=" + nexthash);
-                if (nexthash == null && !this.top.isEmpty()) {
-                    nexthash = this.top.remove();
-                    //System.out.println("*** top.remove()=" + nexthash);
-                }
-                if (nexthash == null) {
-                    nexthash = anyFromDelayed();
-                }
+                byte[] nexthash = getbest();
+                if (nexthash == null) return null;
                 // check minimumDelta and if necessary force a sleep
                 //final int s = urlFileIndex.size();
                     return null;
                 }
                 // depending on the caching policy we need sleep time to avoid DoS-like situations
-                sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry);
-
+                sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry.url());
+
                 assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
                 assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
                 if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops
-
-                if (delay && sleeptime > 0 && this.domStackInitSize > 1) {
-                    //System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
-                    // put that thing back to omit a delay here
-                    if (!ByteBuffer.contains(this.delayed.values(), nexthash)) {
-                        //System.out.println("*** delayed +=" + nexthash);
-                        this.delayed.put(Long.valueOf(System.currentTimeMillis() + sleeptime + 1), nexthash);
-                    }
-                    try {
-                        this.urlFileIndex.put(rowEntry);
-                        String host = crawlEntry.url().getHost();
-                        if (host == null) host = localhost;
-                        this.domainStacks.remove(host);
-                        failhash = nexthash;
-                    } catch (final RowSpaceExceededException e) {
-                        Log.logException(e);
-                    }
-                    continue;
-                }
                 break;
             }
-            if (crawlEntry != null) {
-                if (this.ddc.size() > 10000 || MemoryControl.shortStatus()) this.ddc.clear();
-                try { this.ddc.put(crawlEntry.url().hash()); } catch (final RowSpaceExceededException e) {}
-            }
         }
         if (crawlEntry == null) return null;
@@ -524,7 +426,7 @@
             // in best case, this should never happen if the balancer works propertly
             // this is only to protection against the worst case, where the crawler could
             // behave in a DoS-manner
-            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", top.size() = " + this.top.size() + ", delayed.size() = " + this.delayed.size() + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
+            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
             long loops = sleeptime / 1000;
             long rest = sleeptime % 1000;
             if (loops < 3) {
@@ -537,15 +439,11 @@
                 try {this.wait(1000); } catch (final InterruptedException e) {}
             }
         }
-        this.ddc.remove(crawlEntry.url().hash());
         Latency.update(crawlEntry.url());
         return crawlEntry;
     }
-    private void filltop(final boolean delay, final long maximumwaiting, final boolean acceptonebest) throws RowSpaceExceededException {
-        if (!this.top.isEmpty()) return;
-
-        //System.out.println("*** DEBUG started filltop delay=" + ((delay) ? "true":"false") + ", maximumwaiting=" + maximumwaiting + ", acceptonebest=" + ((acceptonebest) ?
"true":"false")); + private byte[] getbest() { // check if we need to get entries from the file index try { @@ -560,6 +458,7 @@ public class Balancer { long smallestWaiting = Long.MAX_VALUE; byte[] besturlhash = null; String besthost = null; + Map zeroWaitingCandidates = new HashMap(); while (i.hasNext()) { entry = i.next(); @@ -571,34 +470,52 @@ public class Balancer { final byte[] n = entry.getValue().removeOne(); if (n == null) continue; - if (delay) { - final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); - if (w > maximumwaiting) { - if (w < smallestWaiting) { - smallestWaiting = w; - besturlhash = n; - besthost = entry.getKey(); - } - entry.getValue().put(n); // put entry back - continue; + final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); + if (w < smallestWaiting) { + smallestWaiting = w; + besturlhash = n; + besthost = entry.getKey(); + if (w <= 0) { + zeroWaitingCandidates.put(besthost, besturlhash); } } - - this.top.add(n); - if (entry.getValue().isEmpty()) i.remove(); + try { + entry.getValue().put(n); // put entry back, we are checking only + } catch (RowSpaceExceededException e) { + e.printStackTrace(); + } } - // if we could not find any entry, then take the best we have seen so far - if (acceptonebest && !this.top.isEmpty() && besturlhash != null) { - removeHashFromDomainStacks(besthost, besturlhash); - this.top.add(besturlhash); + if (besturlhash == null) return null; // worst case + + // best case would be, if we have some zeroWaitingCandidates, + // then we select that one with the largest stack + if (zeroWaitingCandidates.size() > 0) { + int largestStack = -1; + String largestStackHost = null; + byte[] largestStackHash = null; + for (Map.Entry z: zeroWaitingCandidates.entrySet()) { + HandleSet hs = this.domainStacks.get(z.getKey()); + if (hs == null || hs.size() <= largestStack) continue; + largestStack = hs.size(); + largestStackHost = z.getKey(); + largestStackHash = z.getValue(); + } + if (largestStackHost != null && largestStackHash != null) { + removeHashFromDomainStacks(largestStackHost, largestStackHash); + //Log.logInfo("Balancer", "*** picked one from largest stack"); + return largestStackHash; + } } + + // default case: just take that one with least waiting + removeHashFromDomainStacks(besthost, besturlhash); + return besturlhash; } private void fillDomainStacks() throws IOException { - if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 120000L) return; + if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return; this.domainStacks.clear(); - this.top.clear(); this.lastDomainStackFill = System.currentTimeMillis(); final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2); final CloneableIterator i = handles.keys(true, null); @@ -621,51 +538,6 @@ public class Balancer { this.domStackInitSize = this.domainStacks.size(); } - public List top(int count) { - final List cel = new ArrayList(); - if (count == 0) return cel; - byte[][] ta = new byte[Math.min(count, this.top.size())][]; - ta = this.top.toArray(ta); - for (final byte[] n: ta) { - if (n == null) break; - try { - final Row.Entry rowEntry = this.urlFileIndex.get(n, false); - if (rowEntry == null) continue; - final Request crawlEntry = new Request(rowEntry); - cel.add(crawlEntry); - count--; - if (count <= 0) break; - } catch (final IOException e) {} - } - - int depth = 0; 
-        loop: while (count > 0) {
-            // iterate over the domain stacks
-            final int celsize = cel.size();
-            ll: for (final HandleSet list: this.domainStacks.values()) {
-                if (list.size() <= depth) continue ll;
-                final byte[] n = list.getOne(depth);
-                if (n == null) continue ll;
-                try {
-                    final Row.Entry rowEntry = this.urlFileIndex.get(n, false);
-                    if (rowEntry == null) continue;
-                    final Request crawlEntry = new Request(rowEntry);
-                    cel.add(crawlEntry);
-                    count--;
-                    if (count <= 0) break loop;
-                } catch (final IOException e) {}
-            }
-            if (cel.size() == celsize) break loop;
-            depth++;
-        }
-
-        if (cel.size() < count) try {
-            final List list = this.urlFileIndex.top(count - cel.size());
-            for (final Row.Entry entry: list) cel.add(new Request(entry));
-        } catch (final IOException e) { }
-        return cel;
-    }
-
     public Iterator iterator() throws IOException {
         return new EntryIterator();
     }
             this.rowIterator = Balancer.this.urlFileIndex.rows();
         }
+        @Override
         public boolean hasNext() {
             return (this.rowIterator == null) ? false : this.rowIterator.hasNext();
         }
+        @Override
         public Request next() {
             final Row.Entry entry = this.rowIterator.next();
             try {
             }
         }
+        @Override
         public void remove() {
             if (this.rowIterator != null) this.rowIterator.remove();
         }
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 1a7208212..bb441902c 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -215,7 +215,7 @@ public class CrawlQueues {
     }
     public int coreCrawlJobSize() {
-        return this.noticeURL.stackSize(NoticedURL.StackType.CORE) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
+        return this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
     }
     public boolean coreCrawlJob() {
@@ -226,14 +226,14 @@
             // move some tasks to the core crawl job so we have something to do
             final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance
             for (int i = 0; i < toshift; i++) {
-                this.noticeURL.shift(NoticedURL.StackType.LIMIT, NoticedURL.StackType.CORE, this.sb.crawler);
+                this.noticeURL.shift(NoticedURL.StackType.GLOBAL, NoticedURL.StackType.LOCAL, this.sb.crawler);
             }
             this.log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() +
                     ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + this.sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "") +
                     ", robinsonMode=" + ((this.sb.isRobinsonMode()) ?
"on" : "off")); } - final String queueCheckCore = loadIsPossible(NoticedURL.StackType.CORE); + final String queueCheckCore = loadIsPossible(NoticedURL.StackType.LOCAL); final String queueCheckNoload = loadIsPossible(NoticedURL.StackType.NOLOAD); if (queueCheckCore != null && queueCheckNoload != null) { if (this.log.isFine()) { @@ -251,11 +251,11 @@ public class CrawlQueues { // do a local crawl Request urlEntry; - while (this.noticeURL.stackSize(NoticedURL.StackType.CORE) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { + while (this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { final String stats = "LOCALCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " + - this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + - this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]"; try { @@ -284,7 +284,7 @@ public class CrawlQueues { return true; } - urlEntry = this.noticeURL.pop(NoticedURL.StackType.CORE, true, this.sb.crawler); + urlEntry = this.noticeURL.pop(NoticedURL.StackType.LOCAL, true, this.sb.crawler); if (urlEntry == null) { continue; } @@ -300,7 +300,7 @@ public class CrawlQueues { } catch (final IOException e) { this.log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e); if (e.getMessage().indexOf("hash is null",0) > 0) { - this.noticeURL.clear(NoticedURL.StackType.CORE); + this.noticeURL.clear(NoticedURL.StackType.LOCAL); } } } @@ -547,7 +547,7 @@ public class CrawlQueues { } public int limitCrawlJobSize() { - return this.noticeURL.stackSize(NoticedURL.StackType.LIMIT); + return this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL); } public int noloadCrawlJobSize() { @@ -579,7 +579,7 @@ public class CrawlQueues { } // we don't want to crawl a global URL globally, since WE are the global part. 
         // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
-        final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", " +
+        final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", " +
                 this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
         try {
             final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler);
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index d0d29a34f..9a0c6c557 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -370,14 +370,14 @@ public final class CrawlStacker {
             // it may be possible that global == true and local == true, so do not check an error case against it
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry);
         } else if (local) {
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
         } else if (proxy) {
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
         } else if (remote) {
             warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
         }
diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java
index 3c6dca7da..85644f80d 100644
--- a/source/de/anomic/crawler/Latency.java
+++ b/source/de/anomic/crawler/Latency.java
@@ -146,7 +146,7 @@ public class Latency {
         // return time that is remaining
         //System.out.println("Latency: " + (waiting - timeSinceLastAccess));
-        return waiting - timeSinceLastAccess;
+        return Math.max(0, waiting - timeSinceLastAccess);
     }
     /**
diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java
index 93220c5b0..fa5df6683 100644
--- a/source/de/anomic/crawler/NoticedURL.java
+++ b/source/de/anomic/crawler/NoticedURL.java
@@ -44,7 +44,7 @@ import de.anomic.crawler.retrieval.Request;
 public class NoticedURL {
     public enum StackType {
-        NULL, CORE, LIMIT, OVERHANG, REMOTE, NOLOAD, IMAGE, MOVIE, MUSIC;
+        LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
     }
     public static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
@@ -146,8 +146,8 @@ public class NoticedURL {
     public int stackSize(final StackType stackType) {
         switch (stackType) {
             case NOLOAD: return (this.noloadStack == null) ? 0 : this.noloadStack.size();
-            case CORE: return (this.coreStack == null) ? 0 : this.coreStack.size();
-            case LIMIT: return (this.limitStack == null) ? 0 : this.limitStack.size();
+            case LOCAL: return (this.coreStack == null) ? 0 : this.coreStack.size();
+            case GLOBAL: return (this.limitStack == null) ? 0 : this.limitStack.size();
             case OVERHANG: return 0;
             case REMOTE: return (this.remoteStack == null) ? 0 : this.remoteStack.size();
             default: return -1;
@@ -172,9 +172,9 @@
     public String push(final StackType stackType, final Request entry) {
         try {
             switch (stackType) {
-                case CORE:
+                case LOCAL:
                     return this.coreStack.push(entry);
-                case LIMIT:
+                case GLOBAL:
                     return this.limitStack.push(entry);
                 case REMOTE:
                     return this.remoteStack.push(entry);
@@ -233,30 +233,30 @@
     /**
      * get a list of domains that are currently maintained as domain stacks
      * @return a map of clear text strings of host names to the size of the domain stacks
      */
-    public Map getDomainStackHosts(final StackType stackType) {
+    public Map getDomainStackHosts(final StackType stackType) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainStackHosts();
-            case LIMIT: return this.limitStack.getDomainStackHosts();
+            case LOCAL: return this.coreStack.getDomainStackHosts();
+            case GLOBAL: return this.limitStack.getDomainStackHosts();
             case REMOTE: return this.remoteStack.getDomainStackHosts();
             case NOLOAD: return this.noloadStack.getDomainStackHosts();
             default: return null;
         }
     }
-
+
     /**
      * get a list of domains that are currently maintained as domain stacks
      * @return a collection of clear text strings of host names
      */
     public long getDomainSleepTime(final StackType stackType, final CrawlSwitchboard cs, Request crawlEntry) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
-            case LIMIT: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
+            case LOCAL: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
+            case GLOBAL: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
             case REMOTE: return this.remoteStack.getDomainSleepTime(cs, crawlEntry);
             case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, crawlEntry);
             default: return 0;
         }
     }
-
+
     /**
      * get lists of crawl request entries for a specific host
      * @param host
      * @param maxcount
      * @return a list of crawl loader requests
      */
     public List getDomainStackReferences(final StackType stackType, String host, int maxcount) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainStackReferences(host, maxcount);
-            case LIMIT: return this.limitStack.getDomainStackReferences(host, maxcount);
+            case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount);
+            case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount);
             case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
             case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
             default: return null;
         }
     }
-
-    public List top(final StackType stackType, final int count) {
-        switch (stackType) {
-            case CORE: return top(this.coreStack, count);
-            case LIMIT: return top(this.limitStack, count);
-            case REMOTE: return top(this.remoteStack, count);
-            case NOLOAD: return top(this.noloadStack, count);
-            default: return null;
-        }
-    }
     public Request pop(final StackType stackType, final boolean delay, final CrawlSwitchboard cs) throws IOException {
         switch (stackType) {
-            case CORE: return pop(this.coreStack, delay, cs);
-            case LIMIT: return pop(this.limitStack, delay, cs);
+            case LOCAL: return pop(this.coreStack, delay, cs);
+            case GLOBAL: return pop(this.limitStack, delay, cs);
             case REMOTE: return pop(this.remoteStack, delay, cs);
             case NOLOAD: return pop(this.noloadStack, false, cs);
             default: return null;
@@ -310,8 +300,8 @@
     public void clear(final StackType stackType) {
         Log.logInfo("NoticedURL", "CLEARING STACK " + stackType);
         switch (stackType) {
-            case CORE: this.coreStack.clear(); break;
-            case LIMIT: this.limitStack.clear(); break;
+            case LOCAL: this.coreStack.clear(); break;
+            case GLOBAL: this.limitStack.clear(); break;
             case REMOTE: this.remoteStack.clear(); break;
             case NOLOAD: this.noloadStack.clear(); break;
             default: return;
@@ -340,17 +330,11 @@
         return null;
     }
-    private static List top(final Balancer balancer, int count) {
-        // this is a filo - top
-        if (count > balancer.size()) count = balancer.size();
-        return balancer.top(count);
-    }
-
     public Iterator iterator(final StackType stackType) {
         // returns an iterator of plasmaCrawlBalancerEntry Objects
         try {switch (stackType) {
-            case CORE: return this.coreStack.iterator();
-            case LIMIT: return this.limitStack.iterator();
+            case LOCAL: return this.coreStack.iterator();
+            case GLOBAL: return this.limitStack.iterator();
             case REMOTE: return this.remoteStack.iterator();
             case NOLOAD: return this.noloadStack.iterator();
             default: return null;
diff --git a/source/de/anomic/http/client/Cache.java b/source/de/anomic/http/client/Cache.java
index b886d1924..f89f1d10c 100644
--- a/source/de/anomic/http/client/Cache.java
+++ b/source/de/anomic/http/client/Cache.java
@@ -40,6 +40,7 @@ import java.io.UnsupportedEncodingException;
 import java.util.HashMap;
 import java.util.Map;
+import net.yacy.cora.document.ASCII;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.kelondro.blob.ArrayStack;
 import net.yacy.kelondro.blob.Compressor;
@@ -172,26 +173,30 @@ public final class Cache {
      * @return true if the content of the url is in the cache, false otherwise
      */
     public static boolean has(final DigestURI url) {
+        return has(url.hash());
+    }
+
+    public static boolean has(final byte[] urlhash) {
         boolean headerExists;
         boolean fileExists;
         //synchronized (responseHeaderDB) {
-            headerExists = responseHeaderDB.containsKey(url.hash());
-            fileExists = fileDB.containsKey(url.hash());
+            headerExists = responseHeaderDB.containsKey(urlhash);
+            fileExists = fileDB.containsKey(urlhash);
         //}
         if (headerExists && fileExists) return true;
         if (!headerExists && !fileExists) return false;
         // if not both is there then we do a clean-up
         if (headerExists) try {
-            log.logWarning("header but not content of url " + url.toString() + " in cache; cleaned up");
+            log.logWarning("header but not content of urlhash " + ASCII.String(urlhash) + " in cache; cleaned up");
             if (responseHeaderDB instanceof MapHeap) {
-                ((MapHeap) responseHeaderDB).delete(url.hash());
+                ((MapHeap) responseHeaderDB).delete(urlhash);
             } else {
-                responseHeaderDB.remove(url.hash());
+                responseHeaderDB.remove(urlhash);
             }
         } catch (final IOException e) {}
         if (fileExists) try {
cache; cleaned up"); - fileDB.delete(url.hash()); + fileDB.delete(urlhash); } catch (final IOException e) {} return false; } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index f0f754478..d48f3ac4e 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -152,6 +152,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // the root value here will not be used to load the resource. // it is only the reference for relative links super(linkTags0, linkTags1); + assert root != null; this.root = root; this.evaluationScores = new Evaluation(); this.rss = new HashMap(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 90c6a5d90..87bf68729 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3328,7 +3328,7 @@ public final class Switchboard extends serverSwitch this.peers.mySeed().put(Seed.NCOUNT, Integer.toString(this.crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's) this.peers.mySeed().put( Seed.RCOUNT, - Integer.toString(this.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's) + Integer.toString(this.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.GLOBAL))); // the number of links that the peer provides for remote crawling (ZURL's) this.peers.mySeed().put(Seed.ICOUNT, Long.toString(this.indexSegments.RWICount())); // the minimum number of words that the peer has indexed (as it says) this.peers.mySeed().put(Seed.SCOUNT, Integer.toString(this.peers.sizeConnected())); // the number of seeds that the peer has stored this.peers.mySeed().put(