diff --git a/bin/apicat.sh b/bin/apicat.sh new file mode 100755 index 000000000..f361778ac --- /dev/null +++ b/bin/apicat.sh @@ -0,0 +1,12 @@ +#!/bin/bash +cd "$(dirname "$0")" +port=$(grep ^port= ../DATA/SETTINGS/yacy.conf |cut -d= -f2) +pw=$(grep ^adminAccountBase64MD5= ../DATA/SETTINGS/yacy.conf |cut -d= -f2) + +if which curl &>/dev/null; then + curl -s --header "Authorization: realm=$pw" "http://127.0.0.1:$port/$1" +elif which wget &>/dev/null; then + wget -q -O - -t 1 --timeout=5 --header "Authorization: realm=$pw" "http://127.0.0.1:$port/$1" +else + exit 1 +fi diff --git a/build.xml b/build.xml index 388838334..9d34481eb 100644 --- a/build.xml +++ b/build.xml @@ -388,6 +388,7 @@ + diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 02c01e369..fa5ca6e23 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -109,8 +109,8 @@ inboundlinks_tag_txt ## total number of inbound links, int inboundlinkscount_i -## number of inbound links with noindex tag, int -inboundlinksnoindexcount_i +## number of inbound links with nofollow tag, int +inboundlinksnofollowcount_i ## external links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen outboundlinks_tag_txt @@ -136,8 +136,8 @@ outboundlinks_tag_txt ## external number of inbound links, int outboundlinkscount_i -## number of external links with noindex tag, int -outboundlinksnoindexcount_i +## number of external links with nofollow tag, int +outboundlinksnofollowcount_i ## all image tags, encoded as tag inclusive alt- and title property, textgen images_tag_txt diff --git a/htroot/ConfigParser.java b/htroot/ConfigParser.java index a861da556..1ac18f6f8 100644 --- a/htroot/ConfigParser.java +++ b/htroot/ConfigParser.java @@ -61,6 +61,7 @@ public class ConfigParser { } } env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime()); + env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension()); } } diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index df78753a1..ce86f9321 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -6,14 +6,22 @@ - - + + + + #%env/templates/header.template%# #%env/templates/submenuCrawlMonitor.template%#
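The new bin/apicat.sh above reads port and adminAccountBase64MD5 from DATA/SETTINGS/yacy.conf and fetches a servlet path from the local peer, authenticating with YaCy's "Authorization: realm=<hash>" header. As a rough illustration, the same call in Java (a sketch only: the port value and the placeholder hash are assumptions, and this class is not part of the patch):

    // Illustrative Java equivalent of bin/apicat.sh; the script reads both
    // values below from yacy.conf, here they are hard-coded placeholders.
    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;

    public class ApiCat {
        public static void main(final String[] args) throws Exception {
            final String port = "8090";                  // yacy.conf: port=
            final String pw = "<adminAccountBase64MD5>"; // yacy.conf: adminAccountBase64MD5=
            final URL url = new URL("http://127.0.0.1:" + port + "/" + args[0]);
            final HttpURLConnection con = (HttpURLConnection) url.openConnection();
            // the same authorization header the shell script sends
            con.setRequestProperty("Authorization", "realm=" + pw);
            final BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream()));
            for (String line; (line = in.readLine()) != null;) {
                System.out.println(line); // dump the response to stdout, like curl -s
            }
            in.close();
        }
    }

Called as java ApiCat "api/status_p.xml" it prints the servlet response, just as ./apicat.sh api/status_p.xml would.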

Crawler Queues

-

Next update in seconds. empty - See an access timing here

+ @@ -71,20 +79,6 @@
-
- - - - - - - - - - - -
Speed
PPM
-
@@ -103,15 +97,24 @@
+
+ + + + - + @@ -126,6 +129,7 @@
Indicator Level
Speed + + PPM + +
PPM (Pages Per Minute)          
+

#(info)# @@ -157,23 +161,10 @@ -

Crawl Queue:

- - - - - - - - - - - - - - -
Queue | Profile | Initiator | Depth | Modified Date | Anchor Name | URL | Size | Delete
+

See an access timing

+ + #%env/templates/footer.template%# diff --git a/htroot/IndexCreateQueues_p.html b/htroot/IndexCreateQueues_p.html new file mode 100644 index 000000000..2a803a87b --- /dev/null +++ b/htroot/IndexCreateQueues_p.html @@ -0,0 +1,95 @@ + + + + YaCy '#[clientname]#': '#[queuename]#' Crawl Queue + #%env/templates/metas.template%# + + +
+ #(embed)# + #%env/templates/header.template%# + #%env/templates/submenuCrawlMonitor.template%# +

'#[queuename]#' Crawl Queue

+ ::#(/embed)# + + #(crawler)# +

This crawler queue is empty

+ :: + #(embed)# +
+
+ Delete Entries: + + + + +
+
+ ::#(/embed)# + + + + + + + + + + + + + + + + + + + + + + + + + + #{host}# + + + + + + + #{list}# + + + + + + + + + + + #{/list}# + + + #{/host}# + #(/crawler)# + #(embed)# + #%env/templates/footer.template%# + ::#(/embed)# + + + + \ No newline at end of file diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateQueues_p.java similarity index 53% rename from htroot/IndexCreateWWWLocalQueue_p.java rename to htroot/IndexCreateQueues_p.java index cb109a155..9582de5a4 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateQueues_p.java @@ -1,192 +1,169 @@ -// IndexCreateWWWLocalQueue_p.java -// ------------------------------- -// part of the AnomicHTTPD caching proxy -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004, 2005 -// -//$LastChangedDate$ -//$LastChangedRevision$ -//$LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -// You must compile this file with -// javac -classpath .:../classes IndexCreate_p.java -// if the shell's current path is HTROOT - -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.Iterator; -import java.util.List; -import java.util.Locale; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.kelondro.logging.Log; -import net.yacy.peers.Seed; -import net.yacy.search.Switchboard; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.retrieval.Request; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class IndexCreateWWWLocalQueue_p { - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(final Date date) { - if (date == null) return ""; - return dayFormatter.format(date); - } - - private static final int INVALID = 0; - private static final int URL = 1; - private static final int ANCHOR = 2; - private static final int PROFILE = 3; - private static final int DEPTH = 4; - private static final int INITIATOR = 5; - private static final int MODIFIED = 6; - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - final Switchboard sb = (Switchboard) env; - final serverObjects prop = new serverObjects(); - - int showLimit = 100; - if (post != null) { - showLimit = post.getInt("limit", 100); - - if (post.containsKey("deleteEntries")) { - int c = 0; - - final String pattern = post.get("pattern", ".*").trim(); - final int option = post.getInt("option", INVALID); - if (".*".equals(pattern)) { 
- c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); - sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.CORE); - try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */} - } else if (option > INVALID) { - try { - // compiling the regular expression - final Pattern compiledPattern = Pattern.compile(pattern); - - if (option == PROFILE) { - // search and delete the crawl profile (_much_ faster, independant of queue size) - // XXX: what to do about the annoying LOST PROFILE messages in the log? - CrawlProfile entry; - for (final byte[] handle: sb.crawler.getActive()) { - entry = sb.crawler.getActive(handle); - final String name = entry.name(); - if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) - continue; - if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes()); - } - } else { - // iterating through the list of URLs - final Iterator iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE); - Request entry; - final List removehashes = new ArrayList(); - while (iter.hasNext()) { - if ((entry = iter.next()) == null) continue; - String value = null; - - location: switch (option) { - case URL: value = (entry.url() == null) ? null : entry.url().toString(); break location; - case ANCHOR: value = entry.name(); break location; - case DEPTH: value = Integer.toString(entry.depth()); break location; - case INITIATOR: - value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator()); - break location; - case MODIFIED: value = daydate(entry.appdate()); break location; - default: value = null; break location; - } - - if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash()); - } - Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'"); - for (final byte[] b: removehashes) { - sb.crawlQueues.noticeURL.removeByURLHash(b); - } - } - } catch (final PatternSyntaxException e) { - Log.logException(e); - } - } - - prop.put("info", "3");//crawling queue cleared - prop.putNum("info_numEntries", c); - } else if (post.containsKey("deleteEntry")) { - final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); - prop.put("LOCATION",""); - return prop; - } - } - - int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); - if (stackSize == 0) { - prop.put("crawler-queue", "0"); - } else { - prop.put("crawler-queue", "1"); - final List crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20)); - - Request urle; - boolean dark = true; - Seed initiator; - String profileHandle; - CrawlProfile profileEntry; - int i; - for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { - urle = crawlerList.get(i); - if ((urle != null)&&(urle.url()!=null)) { - initiator = sb.peers.getConnected(urle.initiator() == null ? "" : ASCII.String(urle.initiator())); - profileHandle = urle.profileHandle(); - profileEntry = profileHandle == null ? 
null : sb.crawler.getActive(profileHandle.getBytes()); - prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0"); - prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); - prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); - prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth()); - prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) ); - prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name()); - prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true)); - prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash()); - dark = !dark; - showNum++; - } else { - stackSize--; - } - } - prop.putNum("crawler-queue_list", showNum); - prop.putNum("crawler-queue_num", stackSize);//num Entries - prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent - - } - - // return rewrite properties - return prop; - } -} + +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import net.yacy.cora.document.ASCII; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.logging.Log; +import net.yacy.peers.Seed; +import net.yacy.search.Switchboard; +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.CrawlSwitchboard; +import de.anomic.crawler.NoticedURL.StackType; +import de.anomic.crawler.retrieval.Request; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; + +public class IndexCreateQueues_p { + + private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); + private static String daydate(final Date date) { + if (date == null) return ""; + return dayFormatter.format(date); + } + + private static final int INVALID = 0; + private static final int URL = 1; + private static final int ANCHOR = 2; + private static final int PROFILE = 3; + private static final int DEPTH = 4; + private static final int INITIATOR = 5; + private static final int MODIFIED = 6; + + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + // return variable that accumulates replacements + final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + StackType stackType = StackType.LOCAL; + int urlsPerHost = 5; + boolean embed = false; + String deletepattern = ".*"; + + if (post != null) { + stackType = StackType.valueOf(post.get("stack", stackType.name()).toUpperCase()); + urlsPerHost = post.getInt("urlsPerHost", urlsPerHost); + if (post.containsKey("embed")) embed = true; + + if (post.containsKey("delete")) { + deletepattern = post.get("pattern", deletepattern).trim(); + final int option = post.getInt("option", INVALID); + if (".*".equals(deletepattern)) { + sb.crawlQueues.noticeURL.clear(stackType); + try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */} + } else if (option > INVALID) { + try { + // compiling the regular expression + final Pattern compiledPattern = Pattern.compile(deletepattern); + + if (option == PROFILE) { + // search and delete the crawl profile (_much_ faster, independant of queue size) + // XXX: what to do about the annoying LOST PROFILE messages in the log? 
+ CrawlProfile entry; + for (final byte[] handle: sb.crawler.getActive()) { + entry = sb.crawler.getActive(handle); + final String name = entry.name(); + if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || + name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) + continue; + if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes()); + } + } else { + // iterating through the list of URLs + final Iterator iter = sb.crawlQueues.noticeURL.iterator(stackType); + Request entry; + final List removehashes = new ArrayList(); + while (iter.hasNext()) { + if ((entry = iter.next()) == null) continue; + String value = null; + + location: switch (option) { + case URL: value = (entry.url() == null) ? null : entry.url().toString(); break location; + case ANCHOR: value = entry.name(); break location; + case DEPTH: value = Integer.toString(entry.depth()); break location; + case INITIATOR: + value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator()); + break location; + case MODIFIED: value = daydate(entry.appdate()); break location; + default: value = null; break location; + } + + if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash()); + } + Log.logInfo("IndexCreateQueues_p", "created a remove list with " + removehashes.size() + " entries for pattern '" + deletepattern + "'"); + for (final byte[] b: removehashes) { + sb.crawlQueues.noticeURL.removeByURLHash(b); + } + } + } catch (final PatternSyntaxException e) { + Log.logException(e); + } + } + } + } + + int stackSize = sb.crawlQueues.noticeURL.stackSize(stackType); + if (stackSize == 0) { + prop.put("crawler", "0"); + } else { + prop.put("crawler", "1"); + prop.put("crawler_embed", embed ? 1 : 0); + prop.put("crawler_embed_deletepattern", deletepattern); + prop.put("crawler_embed_queuename", stackType.name()); + + final Map hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType); + + int hc = 0; + for (Map.Entry host: hosts.entrySet()) { + prop.putHTML("crawler_host_" + hc + "_hostname", host.getKey()); + prop.put("crawler_host_" + hc + "_embed", embed ? 1 : 0); + prop.put("crawler_host_" + hc + "_urlsPerHost", urlsPerHost); + prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name()); + prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]); + prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1]); + List domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost); + + Seed initiator; + String profileHandle; + CrawlProfile profileEntry; + int count = 0; + for (Request request: domainStackReferences) { + if (request == null) continue; + initiator = sb.peers.getConnected(request.initiator() == null ? "" : ASCII.String(request.initiator())); + profileHandle = request.profileHandle(); + profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes()); + prop.putHTML("crawler_host_" + hc + "_list_" + count + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); + prop.put("crawler_host_" + hc + "_list_" + count + "_profile", ((profileEntry == null) ? 
"unknown" : profileEntry.name())); + prop.put("crawler_host_" + hc + "_list_" + count + "_depth", request.depth()); + prop.put("crawler_host_" + hc + "_list_" + count + "_modified", daydate(request.appdate()) ); + prop.putHTML("crawler_host_" + hc + "_list_" + count + "_anchor", request.name()); + prop.put("crawler_host_" + hc + "_list_" + count + "_delta", sb.crawlQueues.noticeURL.getDomainSleepTime(stackType, sb.crawler, request)); + prop.putHTML("crawler_host_" + hc + "_list_" + count + "_url", request.url().toNormalform(false, true)); + prop.put("crawler_host_" + hc + "_list_" + count + "_hash", request.url().hash()); + count++; + } + prop.putNum("crawler_host_" + hc + "_list", count); + hc++; + } + prop.put("crawler_host", hc); + } + + prop.put("embed", embed ? 1 : 0); + prop.put("queuename", stackType.name().charAt(0) + stackType.name().substring(1).toLowerCase()); + prop.put("embed_queuename", stackType.name().charAt(0) + stackType.name().substring(1).toLowerCase()); + + // return rewrite properties + return prop; + } +} diff --git a/htroot/IndexCreateWWWGlobalQueue_p.html b/htroot/IndexCreateWWWGlobalQueue_p.html deleted file mode 100644 index 6151b7c0d..000000000 --- a/htroot/IndexCreateWWWGlobalQueue_p.html +++ /dev/null @@ -1,58 +0,0 @@ - - - - YaCy '#[clientname]#': Global Crawl Queue - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuCrawlMonitor.template%# -

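The new IndexCreateQueues_p servlet (completed just above) renders the crawl queue grouped by host. It relies on the reworked Balancer contract shown near the end of this diff: getDomainStackHosts() returns, for each host name, an Integer[] pair of {number of queued URLs, guessed remaining delay in milliseconds}. A self-contained sketch of that value shape with invented host data (the demo class itself is hypothetical; only the map contract mirrors the patch):

    // Hypothetical demo of the host -> {count, delta} map consumed by IndexCreateQueues_p.
    import java.util.Map;
    import java.util.TreeMap;

    public class DomainStackHostsDemo {
        public static void main(final String[] args) {
            // a TreeMap gives a stable host ordering, as in the patched Balancer
            final Map<String, Integer[]> hosts = new TreeMap<String, Integer[]>();
            hosts.put("example.net", new Integer[]{7, 0});    // 7 URLs queued, may be fetched now
            hosts.put("example.org", new Integer[]{42, 500}); // 42 URLs queued, wait ~500 ms
            for (final Map.Entry<String, Integer[]> host : hosts.entrySet()) {
                System.out.println(host.getKey()
                        + " count=" + host.getValue()[0]
                        + " delta/ms=" + host.getValue()[1]);
            }
        }
    }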
Global Crawl Queue

-

- This queue stores the urls that shall be sent to other peers to perform a remote crawl. - If there is no peer for remote crawling available, the links are crawled locally. -

- #(crawler-queue)# -

The global crawler queue is empty

- :: -
-
- -
- -

There are #[num]# entries in the global crawler queue. Showing #[show-num]# most recent entries.

-

Show last 50 | 100 | 250 | 500 entries.

-
Count | Delta/ms | Host | Initiator | Profile | Depth | Modified Date | Anchor Name | Delta/ms | URL
#[hostcount]# | #[hostdelta]# | #[hostname]#
#[initiator]# | #[profile]# | #[depth]# | #[modified]# | #[anchor]# | #[delta]# | #[url]#
- - - - - - - - - - - - - - - - - - #{list}# - - - - - - - - - - #{/list}# -
Initiator | Profile | Depth | Modified Date | Anchor Name | URL | Delete
#[initiator]# | #[profile]# | #[depth]# | #[modified]# | #[anchor]# | #[url]# | [Delete]
- #(/crawler-queue)# - #%env/templates/footer.template%# - - diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java deleted file mode 100644 index efa3222e3..000000000 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ /dev/null @@ -1,125 +0,0 @@ -// IndexCreateWWWGlobalQueue_p.java -// ------------------------------- -// part of the AnomicHTTPD caching proxy -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004, 2005 -// -//$LastChangedDate$ -//$LastChangedRevision$ -//$LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -// You must compile this file with -// javac -classpath .:../classes IndexCreate_p.java -// if the shell's current path is HTROOT - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.List; -import java.util.Locale; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.peers.Seed; -import net.yacy.search.Switchboard; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.retrieval.Request; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class IndexCreateWWWGlobalQueue_p { - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(final Date date) { - if (date == null) return ""; - return dayFormatter.format(date); - } - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - final Switchboard sb = (Switchboard) env; - final serverObjects prop = new serverObjects(); - - int showLimit = 100; - if (post != null) { - showLimit = post.getInt("limit", 100); - - if (post.containsKey("clearcrawlqueue")) { - final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); - sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.LIMIT); - try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */} - /* - int c = 0; - while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) { - urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash(); - if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } - } - */ - prop.put("info", "3");//crawling queue cleared - prop.putNum("info_numEntries", c); - } else if (post.containsKey("deleteEntry")) { - final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); - prop.put("LOCATION",""); - return prop; - } - } - - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); - if (stackSize == 0) { - prop.put("crawler-queue", "0"); - } else { - 
prop.put("crawler-queue", "1"); - final List crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit); - - Request urle; - boolean dark = true; - Seed initiator; - String profileHandle; - CrawlProfile profileEntry; - int i, showNum = 0; - for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { - urle = crawlerList.get(i); - if (urle != null && urle.url() != null) { - initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator())); - profileHandle = urle.profileHandle(); - profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes()); - prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0"); - prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); - prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); - prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth()); - prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) ); - prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name()); - prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true)); - prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash()); - dark = !dark; - showNum++; - } else { - stackSize--; - } - } - prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent - prop.putNum("crawler-queue_num", stackSize);//num Entries - prop.putNum("crawler-queue_list", showNum); - } - - // return rewrite properties - return prop; - } -} diff --git a/htroot/IndexCreateWWWLocalQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html deleted file mode 100644 index 8c7bfeb01..000000000 --- a/htroot/IndexCreateWWWLocalQueue_p.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - YaCy '#[clientname]#': Local Crawl Queue - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuCrawlMonitor.template%# -

Local Crawl Queue

-

- This queue stores the urls that shall be crawled localy by this peer. - It may also contain urls that are computed by the proxy-prefetch. -

- - #(crawler-queue)# -

The local crawler queue is empty

- :: -
-
- Delete Entries: - - - This may take a quite long time. -
-
-

There are #[num]# entries in the local crawler queue. Showing #[show-num]# most recent entries.

-

Show last 50 | 100 | 250 | 500 entries.

- - - - - - - - - - - - - - - - - - - #{list}# - - - - - - - - - - #{/list}# -
Initiator | Profile | Depth | Modified Date | Anchor Name | URL | Delete
#[initiator]# | #[profile]# | #[depth]# | #[modified]# | #[anchor]# | #[url]# | [Delete]
- #(/crawler-queue)# - #%env/templates/footer.template%# - - \ No newline at end of file diff --git a/htroot/IndexCreateWWWRemoteQueue_p.html b/htroot/IndexCreateWWWRemoteQueue_p.html deleted file mode 100644 index abb6cc543..000000000 --- a/htroot/IndexCreateWWWRemoteQueue_p.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - YaCy '#[clientname]#': Remote Crawl Queue - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuCrawlMonitor.template%# -

Remote Crawl Queue

-

- This queue stores the urls that other peers sent to you in order to perform a remote crawl for them. -

- #(crawler-queue)# -

The remote crawler queue is empty

- :: -
-
- -
-
-

- There are #[num]# entries in the remote crawler queue. - Showing #[show-num]# most recent entries. -

-

- Show last 50 | - 100 | - 250 | - 500 entries. -

- - - - - - - - - - - - - - - - - - - #{list}# - - - - - - - - - - #{/list}# -
Initiator | Profile | Depth | Modified Date | Anchor Name | URL | Delete
#[initiator]# | #[profile]# | #[depth]# | #[modified]# | #[anchor]# | #[url]# | [Delete]
- #(/crawler-queue)# - #%env/templates/footer.template%# - - diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java deleted file mode 100644 index a82ee5221..000000000 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ /dev/null @@ -1,120 +0,0 @@ -// IndexCreateWWWRemoteQueue_p.java -// ------------------------------- -// part of the AnomicHTTPD caching proxy -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004, 2005 -// last major change: 04.07.2005 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -// You must compile this file with -// javac -classpath .:../classes IndexCreateWWWRemoteQueue_p.java -// if the shell's current path is HTROOT - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.List; -import java.util.Locale; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.peers.Seed; -import net.yacy.search.Switchboard; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.retrieval.Request; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; -import de.anomic.server.servletProperties; - -public class IndexCreateWWWRemoteQueue_p { - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(final Date date) { - if (date == null) return ""; - return dayFormatter.format(date); - } - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - final servletProperties prop = new servletProperties(); - final Switchboard sb = (Switchboard)env; - - int showLimit = 100; - if (post != null) { - showLimit = post.getInt("limit", 100); - - if (post.containsKey("clearcrawlqueue")) { - final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE); - sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.REMOTE); - try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */} - /* - int c = 0; - while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) { - urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash(); - if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } - } - */ - prop.put("info", "3"); // crawling queue cleared - prop.putNum("info_numEntries", c); - } else if (post.containsKey("deleteEntry")) { - final String urlHash = post.get("deleteEntry"); - sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes()); - prop.put("LOCATION",""); - return prop; - } - } - - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE); - if (stackSize == 0) { - prop.put("crawler-queue", "0"); - } else { - 
prop.put("crawler-queue", "1"); - final List crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit); - - Request urle; - boolean dark = true; - Seed initiator; - String profileHandle; - CrawlProfile profileEntry; - int i, showNum = 0; - for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { - urle = crawlerList.get(i); - if (urle != null && urle.url() != null) { - initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator())); - profileHandle = urle.profileHandle(); - profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes()); - prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0"); - prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName())); - prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); - prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth()); - prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) ); - prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name()); - prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString()); - prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash()); - dark = !dark; - showNum++; - } else { - stackSize--; - } - } - prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent - prop.putNum("crawler-queue_num", stackSize);//num Entries - prop.putNum("crawler-queue_list", showNum); - } - return prop; - } -} diff --git a/htroot/api/queues_p.java b/htroot/api/queues_p.java deleted file mode 100755 index 2c9b81bb4..000000000 --- a/htroot/api/queues_p.java +++ /dev/null @@ -1,124 +0,0 @@ -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.List; -import java.util.Locale; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.peers.Seed; -import net.yacy.search.Switchboard; -import net.yacy.search.SwitchboardConstants; -import net.yacy.search.index.Segment; -import net.yacy.search.index.Segments; -import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.retrieval.Request; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class queues_p { - - public static final String STATE_RUNNING = "running"; - public static final String STATE_PAUSED = "paused"; - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(final Date date) { - if (date == null) return ""; - return dayFormatter.format(date); - } - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - final Switchboard sb = (Switchboard) env; - //wikiCode wikiTransformer = new wikiCode(switchboard); - final serverObjects prop = new serverObjects(); - Segment segment = null; - final boolean html = post != null && post.containsKey("html"); - prop.setLocalized(html); - if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header)) { - segment = sb.indexSegments.segment(post.get("segment")); - } - if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC); - prop.put("rejected", "0"); - //int showRejectedCount = 10; - - Seed initiator; - - // index size - prop.putNum("urlpublictextSize", 
segment.urlMetadata().size()); - prop.putNum("rwipublictextSize", segment.termIndex().sizesMax()); - - // loader queue - prop.putNum("loaderSize", sb.crawlQueues.workerSize()); - prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)); - if (sb.crawlQueues.workerSize() == 0) { - prop.put("list-loader", "0"); - } else { - final Request[] w = sb.crawlQueues.activeWorkerEntries(); - int count = 0; - for (final Request r : w) { - if (r == null) continue; - prop.put("list-loader_"+count+"_profile", r.profileHandle()); - initiator = sb.peers.getConnected((r.initiator() == null) ? "" : ASCII.String(r.initiator())); - prop.putHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); - prop.put("list-loader_"+count+"_depth", r.depth()); - prop.putXML("list-loader_"+count+"_url", r.url().toString()); - count++; - } - prop.put("list-loader", count); - } - - //local crawl queue - prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount()); - prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING); - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); - addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, Math.min(10, stackSize))); - - //global crawl queue - prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize()); - prop.put("limitCrawlState", STATE_RUNNING); - stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); - - //remote crawl queue - prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount()); - prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING); - stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); - - if (stackSize == 0) { - prop.put("list-remote", "0"); - } else { - addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, Math.min(10, stackSize))); - } - - //noload crawl queue - prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize()); - prop.put("noloadCrawlState", STATE_RUNNING); - //stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.NOLOAD); - - - // return rewrite properties - return prop; - } - - - public static final void addNTable(final Switchboard sb, final serverObjects prop, final String tableName, final List crawlerList) { - - int showNum = 0; - Seed initiator; - for (final Request urle : crawlerList) { - if ((urle != null) && (urle.url() != null)) { - initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : UTF8.String(urle.initiator())); - prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle()); - prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? 
"proxy" : initiator.getName())); - prop.put(tableName + "_" + showNum + "_depth", urle.depth()); - prop.put(tableName + "_" + showNum + "_modified", daydate(urle.appdate())); - prop.putXML(tableName + "_" + showNum + "_anchor", urle.name()); - prop.putXML(tableName + "_" + showNum + "_url", urle.url().toNormalform(false, true)); - prop.put(tableName + "_" + showNum + "_hash", urle.url().hash()); - showNum++; - } - } - prop.put(tableName, showNum); - - } -} diff --git a/htroot/api/queues_p.xml b/htroot/api/queues_p.xml deleted file mode 100644 index d8d3d8abb..000000000 --- a/htroot/api/queues_p.xml +++ /dev/null @@ -1,71 +0,0 @@ - - - - #[urlpublictextSize]# - #[rwipublictextSize]# - - - #[loaderSize]# - #[loaderMax]# -#{list-loader}# - - #[profile]# - #[initiator]# - #[depth]# - #[url]# - -#{/list-loader}# - - - #[localCrawlSize]# - #[localCrawlState]# -#{list-local}# - - #[profile]# - #[initiator]# - #[depth]# - #[modified]# - #[anchor]# - #[url]# - #[hash]# - #(inProcess)#false::true#(/inProcess)# - -#{/list-local}# - - - #[limitCrawlSize]# - #[limitCrawlState]# -#{list-limit}# - - #[profile]# - #[initiator]# - #[depth]# - #[modified]# - #[anchor]# - #[url]# - #[hash]# - #(inProcess)#false::true#(/inProcess)# - -#{/list-limit}# - - - #[remoteCrawlSize]# - #[remoteCrawlState]# -#{list-remote}# - - #[profile]# - #[initiator]# - #[depth]# - #[modified]# - #[anchor]# - #[url]# - #[hash]# - #(inProcess)#false::true#(/inProcess)# - -#{/list-remote}# - - - #[noloadCrawlSize]# - #[noloadCrawlState]# - - diff --git a/htroot/api/schema_p.xml b/htroot/api/schema_p.xml index e81ee0cd0..667588e44 100755 --- a/htroot/api/schema_p.xml +++ b/htroot/api/schema_p.xml @@ -39,7 +39,7 @@ - + @@ -57,7 +57,7 @@ id - description + sku diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index ce26f1b65..83ba95e41 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -1,4 +1,29 @@ - +// status_p +// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 18.12.2006 on http://www.anomic.de +// this file was created using the an implementation from IndexCreate_p.java, published 02.12.2004 +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.io.ByteCount; @@ -13,6 +38,8 @@ import de.anomic.server.serverSwitch; public class status_p { + public static final String STATE_RUNNING = "running"; + public static final String STATE_PAUSED = "paused"; public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements @@ -53,6 +80,30 @@ public class status_p { prop.put("trafficProxy", ByteCount.getAccountCount(ByteCount.PROXY)); prop.put("trafficCrawler", ByteCount.getAccountCount(ByteCount.CRAWLER)); + // index size + prop.putNum("urlpublictextSize", segment.urlMetadata().size()); + prop.putNum("rwipublictextSize", segment.termIndex().sizesMax()); + + // loader queue + prop.putNum("loaderSize", sb.crawlQueues.workerSize()); + prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)); + + //local crawl queue + prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount()); + prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING); + + //global crawl queue + prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize()); + prop.put("limitCrawlState", STATE_RUNNING); + + //remote crawl queue + prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount()); + prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? 
STATE_PAUSED : STATE_RUNNING); + + //noload crawl queue + prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize()); + prop.put("noloadCrawlState", STATE_RUNNING); + // return rewrite properties return prop; } diff --git a/htroot/api/status_p.xml b/htroot/api/status_p.xml index a8f4ae4eb..25c62234b 100644 --- a/htroot/api/status_p.xml +++ b/htroot/api/status_p.xml @@ -1,35 +1,52 @@ #[ppm]# + #[wordCacheSize]# #[wordCacheMaxSize]# + + + #[freeMemory]# + #[totalMemory]# + #[maxMemory]# + + + #[processors]# + + + #[trafficIn]# + #[trafficProxy]# + #[trafficCrawler]# + + + + #[urlpublictextSize]# + #[rwipublictextSize]# + #[loaderSize]# #[loaderMax]# + #[localCrawlSize]# + #[localCrawlState]# + #[limitCrawlSize]# + #[limitCrawlState]# + #[remoteCrawlSize]# + #[remoteCrawlState]# + #[noloadCrawlSize]# + #[noloadCrawlState]# - - - #[freeMemory]# - #[totalMemory]# - #[maxMemory]# - - #[processors]# - - #[trafficIn]# - #[trafficProxy]# - #[trafficCrawler]# - + diff --git a/htroot/api/timeline.java b/htroot/api/timeline.java index 43d59bc3b..bb269dcbb 100644 --- a/htroot/api/timeline.java +++ b/htroot/api/timeline.java @@ -9,7 +9,7 @@ // $LastChangedBy: orbiter $ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -24,9 +24,9 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import java.util.Collection; import java.util.Date; import java.util.Iterator; -import java.util.TreeSet; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.protocol.RequestHeader; @@ -43,7 +43,6 @@ import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; import net.yacy.search.index.Segments; import net.yacy.search.query.QueryParams; - import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -52,18 +51,18 @@ public final class timeline { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; - + final serverObjects prop = new serverObjects(); if ((post == null) || (env == null)) return prop; final boolean authenticated = sb.adminAuthenticated(header) >= 2; - + Segment segment = null; if (post.containsKey("segment") && authenticated) { segment = sb.indexSegments.segment(post.get("segment")); } else { segment = sb.indexSegments.segment(Segments.Process.PUBLIC); } - + final String querystring = post.get("query", ""); // a string of word hashes that shall be searched and combined final int count = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", 1000)); // SRU syntax final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); @@ -75,22 +74,22 @@ public final class timeline { language = (agent == null) ? 
"en" : ISO639.userAgentLanguageDetection(agent); if (language == null) language = "en"; } - final TreeSet[] query = QueryParams.cleanQuery(querystring); // converts also umlaute + final Collection[] query = QueryParams.cleanQuery(querystring); // converts also umlaute HandleSet q = Word.words2hashesHandles(query[0]); - + // tell all threads to do nothing for a specific time sb.intermissionAllThreads(3000); // prepare search final long timestamp = System.currentTimeMillis(); - + // prepare an abstract result int indexabstractContainercount = 0; int joincount = 0; // retrieve index containers //yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links"); - + // get the index container with the result vector TermSearch search = null; try { @@ -99,7 +98,7 @@ public final class timeline { Log.logException(e); } ReferenceContainer index = search.joined(); - + Iterator i = index.entries(); WordReference entry; int c = 0; @@ -117,14 +116,14 @@ public final class timeline { c++; } prop.put("event", c); - + // log Network.log.logInfo("EXIT TIMELINE SEARCH: " + QueryParams.anonymizedQueryHashes(q) + " - " + joincount + " links found, " + prop.get("linkcount", "?") + " links selected, " + indexabstractContainercount + " index abstracts, " + (System.currentTimeMillis() - timestamp) + " milliseconds"); - + return prop; } diff --git a/htroot/env/grafics/trash.gif b/htroot/env/grafics/trash.gif new file mode 100644 index 000000000..3ff757f77 Binary files /dev/null and b/htroot/env/grafics/trash.gif differ diff --git a/htroot/env/templates/submenuCrawlMonitor.template b/htroot/env/templates/submenuCrawlMonitor.template index 542d1d6c7..992c36bd5 100644 --- a/htroot/env/templates/submenuCrawlMonitor.template +++ b/htroot/env/templates/submenuCrawlMonitor.template @@ -14,9 +14,10 @@ diff --git a/htroot/index.html b/htroot/index.html index 4039a881d..a7fa56240 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -100,17 +100,17 @@ : - 10 - 50 - 100 + + + #(resource-select)#:: : - the peer-to-peer network - only the local index + + #(/resource-select)# diff --git a/htroot/js/Crawler.js b/htroot/js/Crawler.js index ef0e861f5..9747dba41 100644 --- a/htroot/js/Crawler.js +++ b/htroot/js/Crawler.js @@ -5,12 +5,10 @@ WORDCACHEBAR_LENGTH=1/4; var statusRPC; -var queuesRPC; -var refreshInterval=5; +var refreshInterval=3; var wait=0; var changing=false; //change the interval var statusLoaded=true; -var queueLoaded=true; function initCrawler(){ refresh(); @@ -38,21 +36,20 @@ function newInterval(){ countInterval=window.setInterval("countdown()", 1000); changing=false; } + function countdown(){ - if(statusLoaded && queueLoaded){ - document.getElementById("nextUpdate").value=wait; - wait--; - if(wait==0){ + if(statusLoaded){ + wait--; + if (wait == 0) { refresh(); } } } + function refresh(){ wait=refreshInterval; statusLoaded=false; - queueLoaded=false; requestStatus(); - requestQueues(); } function requestStatus(){ @@ -61,13 +58,6 @@ function requestStatus(){ statusRPC.onreadystatechange = handleStatus; statusRPC.send(null); } -function requestQueues(){ - queuesRPC=createRequestObject(); - queuesRPC.open('get', '/api/queues_p.xml?html='); - queuesRPC.onreadystatechange = handleQueues; - queuesRPC.send(null); - -} function handleStatus(){ if(statusRPC.readyState != 4){ @@ -118,65 +108,44 @@ function handleStatus(){ img.setAttribute("src", BAR_IMG1); wordCacheSpan.appendChild(img); } - statusLoaded=true; -} - -function handleQueues(){ - 
if(queuesRPC.readyState != 4){ - return; - } - var queuesResponse = queuesRPC.responseXML; - //xml=getFirstChild(queuesResponse); - xml=getFirstChild(queuesResponse, "queues"); - if(queuesResponse != null){ - clearTable(document.getElementById("queueTable"), 1); - dbsize=getFirstChild(xml, "dbsize"); - urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext")); - rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext")); - document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize; - document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize; - - loaderqueue=getFirstChild(xml, "loaderqueue"); - updateTable(loaderqueue, "loader"); - - loaderqueue_size=getValue(getFirstChild(loaderqueue, "size")); - loaderqueue_max=getValue(getFirstChild(loaderqueue, "max")); - document.getElementById("loaderqueuesize").firstChild.nodeValue=loaderqueue_size; - document.getElementById("loaderqueuemax").firstChild.nodeValue=loaderqueue_max; - - localcrawlerqueue=getFirstChild(xml, "localcrawlerqueue"); - localcrawlerqueue_size=getValue(getFirstChild(localcrawlerqueue, "size")); - localcrawlerqueue_state=getValue(getFirstChild(localcrawlerqueue, "state")); - document.getElementById("localcrawlerqueuesize").firstChild.nodeValue=localcrawlerqueue_size; - putQueueState("localcrawler", localcrawlerqueue_state); - - updateTable(localcrawlerqueue, "local crawler"); - - limitcrawlerqueue=getFirstChild(xml, "limitcrawlerqueue"); - updateTable(limitcrawlerqueue, "limitCrawlerTable"); - limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size")); - limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state")); - document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size; - putQueueState("limitcrawler", limitcrawlerqueue_state); - updateTable(limitcrawlerqueue, "limit crawler"); - - remotecrawlerqueue=getFirstChild(xml, "remotecrawlerqueue"); - updateTable(remotecrawlerqueue, "remoteCrawlerTable"); - remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size")); - remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state")); - document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size; - putQueueState("remotecrawler", remotecrawlerqueue_state); - updateTable(remotecrawlerqueue, "remote crawler"); - - noloadcrawlerqueue=getFirstChild(xml, "noloadcrawlerqueue"); - noloadcrawlerqueue_size=getValue(getFirstChild(noloadcrawlerqueue, "size")); - noloadcrawlerqueue_state=getValue(getFirstChild(noloadcrawlerqueue, "state")); - document.getElementById("noloadcrawlerqueuesize").firstChild.nodeValue=noloadcrawlerqueue_size; - putQueueState("noloadcrawler", noloadcrawlerqueue_state); + dbsize=getFirstChild(statusTag, "dbsize"); + urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext")); + rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext")); + document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize; + document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize; + + loaderqueue=getFirstChild(statusTag, "loaderqueue"); + loaderqueue_size=getValue(getFirstChild(loaderqueue, "size")); + loaderqueue_max=getValue(getFirstChild(loaderqueue, "max")); + document.getElementById("loaderqueuesize").firstChild.nodeValue=loaderqueue_size; + document.getElementById("loaderqueuemax").firstChild.nodeValue=loaderqueue_max; + + localcrawlerqueue=getFirstChild(statusTag, "localcrawlerqueue"); + 
localcrawlerqueue_size=getValue(getFirstChild(localcrawlerqueue, "size")); + localcrawlerqueue_state=getValue(getFirstChild(localcrawlerqueue, "state")); + document.getElementById("localcrawlerqueuesize").firstChild.nodeValue=localcrawlerqueue_size; + putQueueState("localcrawler", localcrawlerqueue_state); + + limitcrawlerqueue=getFirstChild(statusTag, "limitcrawlerqueue"); + limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size")); + limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state")); + document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size; + putQueueState("limitcrawler", limitcrawlerqueue_state); + + remotecrawlerqueue=getFirstChild(statusTag, "remotecrawlerqueue"); + remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size")); + remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state")); + document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size; + putQueueState("remotecrawler", remotecrawlerqueue_state); + + noloadcrawlerqueue=getFirstChild(statusTag, "noloadcrawlerqueue"); + noloadcrawlerqueue_size=getValue(getFirstChild(noloadcrawlerqueue, "size")); + noloadcrawlerqueue_state=getValue(getFirstChild(noloadcrawlerqueue, "state")); + document.getElementById("noloadcrawlerqueuesize").firstChild.nodeValue=noloadcrawlerqueue_size; + putQueueState("noloadcrawler", noloadcrawlerqueue_state); - } - queueLoaded=true; + statusLoaded=true; } function putQueueState(queue, state) { @@ -184,53 +153,17 @@ function putQueueState(queue, state) { img = document.getElementById(queue + "stateIMG"); if (state == "paused") { a.href = "Crawler_p.html?continue=" + queue; - a.title = "Continue this queue"; + a.title = "Continue this queue (" + state + ")"; img.src = "/env/grafics/start.gif"; img.alt = "Continue this queue"; } else { a.href = "Crawler_p.html?pause=" + queue; - a.title = "Pause this queue"; + a.title = "Pause this queue (" + state + ")"; img.src = "/env/grafics/stop.gif"; img.alt = "Pause this queue"; } } -function updateTable(indexingqueue, tablename){ - indexingTable=document.getElementById("queueTable"); - entries=indexingqueue.getElementsByTagName("entry"); - - dark=false; - for(i=0;i 80) { diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index e01f091fb..20116c629 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -60,7 +60,7 @@ public class urls { if (post.get("call", "").equals("remotecrawl")) { // perform a remote crawl url handover - final NoticedURL.StackType stackType = NoticedURL.StackType.LIMIT; + final NoticedURL.StackType stackType = NoticedURL.StackType.GLOBAL; int maxCount = Math.min(100, post.getInt("count", 10)); final long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000))); final long timeout = System.currentTimeMillis() + maxTime; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index f7b659cf6..6f1a7835a 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -602,14 +602,14 @@ public class yacysearch { (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", ""); // the query - final TreeSet[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute + final Collection[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? 
query.length - 1 : Integer.MAX_VALUE; // filter out stopwords - final SortedSet filtered = SetTools.joinConstructive(query[0], Switchboard.stopwords); + final SortedSet filtered = SetTools.joinConstructiveByTest(query[0], Switchboard.stopwords); if ( !filtered.isEmpty() ) { - SetTools.excludeDestructive(query[0], Switchboard.stopwords); + SetTools.excludeDestructiveByTestSmallInLarge(query[0], Switchboard.stopwords); } // if a minus-button was hit, remove a special reference first diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 089556392..d6a5dce32 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -25,8 +25,8 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.net.MalformedURLException; +import java.util.Collection; import java.util.List; -import java.util.Set; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; @@ -165,7 +165,7 @@ public class yacysearchitem { prop.putHTML("content_publisher", result.publisher()); prop.putHTML("content_creator", result.creator());// author prop.putHTML("content_subject", result.subject()); - final Set[] query = theQuery.queryWords(); + final Collection[] query = theQuery.queryWords(); final StringBuilder s = new StringBuilder(query[0].size() * 20); for (final String t: query[0]) { s.append('+').append(t); diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index a9d3438ca..b6bf3af44 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -29,22 +29,20 @@ package de.anomic.crawler; import java.io.File; import java.io.IOException; import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentMap; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.services.federated.yacy.CacheStrategy; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.BufferedObjectIndex; import net.yacy.kelondro.index.HandleSet; @@ -53,7 +51,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.table.Table; -import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.MemoryControl; import de.anomic.crawler.retrieval.Request; import de.anomic.http.client.Cache; @@ -74,9 +71,6 @@ public class Balancer { // class variables computed during operation private final ConcurrentMap domainStacks; // a map from host name to lists with url hashs - private final ConcurrentLinkedQueue top; // a list of url-hashes that shall be taken next - private final SortedMap delayed; - private final HandleSet ddc; private final HandleSet double_push_check; // for debugging private long lastDomainStackFill; private int domStackInitSize; @@ -91,13 +85,10 @@ public class Balancer { final boolean exceed134217727) { this.cacheStacksPath = cachePath; this.domainStacks = new ConcurrentHashMap(); - this.top = new ConcurrentLinkedQueue(); - this.delayed = new TreeMap(); this.minimumLocalDelta = minimumLocalDelta; 
this.minimumGlobalDelta = minimumGlobalDelta; this.myAgentIDs = myAgentIDs; this.domStackInitSize = Integer.MAX_VALUE; - this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.double_push_check = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // create a stack for newly entered entries @@ -145,12 +136,7 @@ public class Balancer { Log.logException(e); } this.domainStacks.clear(); - this.top.clear(); - synchronized (this.delayed) { - this.delayed.clear(); - } this.double_push_check.clear(); - this.ddc.clear(); } public Request get(final byte[] urlhash) throws IOException { @@ -202,28 +188,11 @@ public class Balancer { if (entry != null) removedCounter++; // remove from double-check caches - this.ddc.remove(urlhash); this.double_push_check.remove(urlhash); } if (removedCounter == 0) return 0; assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s; - // iterate through the top list - final Iterator j = this.top.iterator(); - byte[] urlhash; - while (j.hasNext()) { - urlhash = j.next(); - if (urlHashes.has(urlhash)) j.remove(); - } - - // remove from delayed - synchronized (this.delayed) { - final Iterator> k = this.delayed.entrySet().iterator(); - while (k.hasNext()) { - if (urlHashes.has(k.next().getValue())) k.remove(); - } - } - // iterate through the domain stacks final Iterator> q = this.domainStacks.entrySet().iterator(); HandleSet stack; @@ -237,7 +206,7 @@ public class Balancer { } public boolean has(final byte[] urlhashb) { - return this.urlFileIndex.has(urlhashb) || this.ddc.has(urlhashb); + return this.urlFileIndex.has(urlhashb) || this.double_push_check.has(urlhashb); } public boolean notEmpty() { @@ -277,7 +246,6 @@ public class Balancer { synchronized (this) { // double-check if (this.double_push_check.has(hash)) return "double occurrence in double_push_check"; - if (this.ddc.has(hash)) return "double occurrence in ddc"; if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex"; if (this.double_push_check.size() > 10000 || MemoryControl.shortStatus()) this.double_push_check.clear(); @@ -297,16 +265,16 @@ public class Balancer { /** * get a list of domains that are currently maintained as domain stacks - * @return a map of clear text strings of host names to the size of the domain stack + * @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time} */ - public Map getDomainStackHosts() { - Map map = new HashMap(); + public Map getDomainStackHosts() { + Map map = new TreeMap(); // we use a tree map to get a stable ordering for (Map.Entry entry: this.domainStacks.entrySet()) { - map.put(entry.getKey(), entry.getValue().size()); + map.put(entry.getKey(), new Integer[]{entry.getValue().size(), (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)}); } return map; } - + /** * compute the current sleep time for a given crawl entry * @param cs @@ -315,20 +283,20 @@ public class Balancer { */ public long getDomainSleepTime(final CrawlSwitchboard cs, Request crawlEntry) { final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle())); - return getDomainSleepTime(cs, profileEntry, crawlEntry); + return getDomainSleepTime(cs, profileEntry, crawlEntry.url()); } - - private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, Request 
crawlEntry) { + + private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, final DigestURI crawlURL) { if (profileEntry == null) { return 0; } long sleeptime = ( profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY || - (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url())) - ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server + (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash())) + ) ? 0 : Latency.waitingRemaining(crawlURL, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server return sleeptime; } - + /** * get lists of crawl request entries for a specific host * @param host @@ -360,7 +328,7 @@ public class Balancer { } return cel; } - + private void pushHashToDomainStacks(String host, final byte[] urlhash) throws RowSpaceExceededException { // extend domain stack if (host == null) host = localhost; @@ -388,21 +356,6 @@ public class Balancer { if (domainList.isEmpty()) this.domainStacks.remove(host); } - private byte[] nextFromDelayed() { - if (this.delayed.isEmpty()) return null; - final Long first = this.delayed.firstKey(); - if (first.longValue() < System.currentTimeMillis()) { - return this.delayed.remove(first); - } - return null; - } - - private byte[] anyFromDelayed() { - if (this.delayed.isEmpty()) return null; - final Long first = this.delayed.firstKey(); - return this.delayed.remove(first); - } - /** * get the next entry in this crawl queue in such a way that the domain access time delta is maximized * and always above the given minimum delay time. 
An additional delay time is computed using the robots.txt @@ -418,41 +371,13 @@ public class Balancer { public Request pop(final boolean delay, final CrawlSwitchboard cs) throws IOException { // returns a crawl entry from the stack and ensures minimum delta times - try { - filltop(delay, -600000, false); - filltop(delay, -60000, false); - filltop(delay, -10000, false); - filltop(delay, -6000, false); - filltop(delay, -4000, false); - filltop(delay, -3000, false); - filltop(delay, -2000, false); - filltop(delay, -1000, false); - filltop(delay, -500, false); - filltop(delay, 0, true); - filltop(delay, 500, true); - filltop(delay, 1000, true); - filltop(delay, 2000, true); - filltop(delay, 3000, true); - filltop(delay, 4000, true); - filltop(delay, 6000, true); - filltop(delay, Long.MAX_VALUE, true); - } catch (final RowSpaceExceededException e) {} - long sleeptime = 0; Request crawlEntry = null; synchronized (this) { byte[] failhash = null; while (!this.urlFileIndex.isEmpty()) { - // first simply take one of the entries in the top list, that should be one without any delay - byte[] nexthash = nextFromDelayed(); - //System.out.println("*** nextFromDelayed=" + nexthash); - if (nexthash == null && !this.top.isEmpty()) { - nexthash = this.top.remove(); - //System.out.println("*** top.remove()=" + nexthash); - } - if (nexthash == null) { - nexthash = anyFromDelayed(); - } + byte[] nexthash = getbest(); + if (nexthash == null) return null; // check minimumDelta and if necessary force a sleep //final int s = urlFileIndex.size(); @@ -485,37 +410,14 @@ public class Balancer { return null; } // depending on the caching policy we need sleep time to avoid DoS-like situations - sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry); - + sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry.url()); + assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes()); assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash()); if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops - - if (delay && sleeptime > 0 && this.domStackInitSize > 1) { - //System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash); - // put that thing back to omit a delay here - if (!ByteBuffer.contains(this.delayed.values(), nexthash)) { - //System.out.println("*** delayed +=" + nexthash); - this.delayed.put(Long.valueOf(System.currentTimeMillis() + sleeptime + 1), nexthash); - } - try { - this.urlFileIndex.put(rowEntry); - String host = crawlEntry.url().getHost(); - if (host == null) host = localhost; - this.domainStacks.remove(host); - failhash = nexthash; - } catch (final RowSpaceExceededException e) { - Log.logException(e); - } - continue; - } break; } - if (crawlEntry != null) { - if (this.ddc.size() > 10000 || MemoryControl.shortStatus()) this.ddc.clear(); - try { this.ddc.put(crawlEntry.url().hash()); } catch (final RowSpaceExceededException e) {} - } } if (crawlEntry == null) return null; @@ -524,7 +426,7 @@ public class Balancer { // in best case, this should never happen if the balancer works propertly // this is only to protection against the worst case, where the crawler could // behave in a DoS-manner - Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " 
milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", top.size() = " + this.top.size() + ", delayed.size() = " + this.delayed.size() + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
+            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
             long loops = sleeptime / 1000;
             long rest = sleeptime % 1000;
             if (loops < 3) {
@@ -537,15 +439,11 @@
                 try {this.wait(1000); } catch (final InterruptedException e) {}
             }
         }
-        this.ddc.remove(crawlEntry.url().hash());
         Latency.update(crawlEntry.url());
         return crawlEntry;
     }
-    private void filltop(final boolean delay, final long maximumwaiting, final boolean acceptonebest) throws RowSpaceExceededException {
-        if (!this.top.isEmpty()) return;
-
-        //System.out.println("*** DEBUG started filltop delay=" + ((delay) ? "true":"false") + ", maximumwaiting=" + maximumwaiting + ", acceptonebest=" + ((acceptonebest) ? "true":"false"));
+    private byte[] getbest() {
         // check if we need to get entries from the file index
         try {
@@ -560,6 +458,7 @@
         long smallestWaiting = Long.MAX_VALUE;
         byte[] besturlhash = null;
         String besthost = null;
+        Map<String, byte[]> zeroWaitingCandidates = new HashMap<String, byte[]>();
         while (i.hasNext()) {
             entry = i.next();
@@ -571,34 +470,52 @@
             final byte[] n = entry.getValue().removeOne();
             if (n == null) continue;
-            if (delay) {
-                final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
-                if (w > maximumwaiting) {
-                    if (w < smallestWaiting) {
-                        smallestWaiting = w;
-                        besturlhash = n;
-                        besthost = entry.getKey();
-                    }
-                    entry.getValue().put(n); // put entry back
-                    continue;
+            final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
+            if (w < smallestWaiting) {
+                smallestWaiting = w;
+                besturlhash = n;
+                besthost = entry.getKey();
+                if (w <= 0) {
+                    zeroWaitingCandidates.put(besthost, besturlhash);
                 }
             }
-
-            this.top.add(n);
-            if (entry.getValue().isEmpty()) i.remove();
+            try {
+                entry.getValue().put(n); // put entry back, we are checking only
+            } catch (RowSpaceExceededException e) {
+                Log.logException(e);
+            }
         }
-        // if we could not find any entry, then take the best we have seen so far
-        if (acceptonebest && !this.top.isEmpty() && besturlhash != null) {
-            removeHashFromDomainStacks(besthost, besturlhash);
-            this.top.add(besturlhash);
+        if (besturlhash == null) return null; // worst case
+
+        // best case: if we have some zeroWaitingCandidates,
+        // then we select the one with the largest stack
+        if (zeroWaitingCandidates.size() > 0) {
+            int largestStack = -1;
+            String largestStackHost = null;
+            byte[] largestStackHash = null;
+            for (Map.Entry<String, byte[]> z: zeroWaitingCandidates.entrySet()) {
+                HandleSet hs = this.domainStacks.get(z.getKey());
+                if (hs == null || hs.size() <= largestStack) continue;
+                largestStack = hs.size();
+                largestStackHost = z.getKey();
+                largestStackHash = z.getValue();
+            }
+            if (largestStackHost != null && largestStackHash != null) {
+                removeHashFromDomainStacks(largestStackHost, largestStackHash);
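+                // draining the largest zero-waiting backlog first maximizes crawler
+                // throughput while per-host politeness delays are still respected,
+                // because only hosts without remaining waiting time are candidates here
+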
//Log.logInfo("Balancer", "*** picked one from largest stack"); + return largestStackHash; + } } + + // default case: just take that one with least waiting + removeHashFromDomainStacks(besthost, besturlhash); + return besturlhash; } private void fillDomainStacks() throws IOException { - if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 120000L) return; + if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return; this.domainStacks.clear(); - this.top.clear(); this.lastDomainStackFill = System.currentTimeMillis(); final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2); final CloneableIterator i = handles.keys(true, null); @@ -621,51 +538,6 @@ public class Balancer { this.domStackInitSize = this.domainStacks.size(); } - public List top(int count) { - final List cel = new ArrayList(); - if (count == 0) return cel; - byte[][] ta = new byte[Math.min(count, this.top.size())][]; - ta = this.top.toArray(ta); - for (final byte[] n: ta) { - if (n == null) break; - try { - final Row.Entry rowEntry = this.urlFileIndex.get(n, false); - if (rowEntry == null) continue; - final Request crawlEntry = new Request(rowEntry); - cel.add(crawlEntry); - count--; - if (count <= 0) break; - } catch (final IOException e) {} - } - - int depth = 0; - loop: while (count > 0) { - // iterate over the domain stacks - final int celsize = cel.size(); - ll: for (final HandleSet list: this.domainStacks.values()) { - if (list.size() <= depth) continue ll; - final byte[] n = list.getOne(depth); - if (n == null) continue ll; - try { - final Row.Entry rowEntry = this.urlFileIndex.get(n, false); - if (rowEntry == null) continue; - final Request crawlEntry = new Request(rowEntry); - cel.add(crawlEntry); - count--; - if (count <= 0) break loop; - } catch (final IOException e) {} - } - if (cel.size() == celsize) break loop; - depth++; - } - - if (cel.size() < count) try { - final List list = this.urlFileIndex.top(count - cel.size()); - for (final Row.Entry entry: list) cel.add(new Request(entry)); - } catch (final IOException e) { } - return cel; - } - public Iterator iterator() throws IOException { return new EntryIterator(); } @@ -678,10 +550,12 @@ public class Balancer { this.rowIterator = Balancer.this.urlFileIndex.rows(); } + @Override public boolean hasNext() { return (this.rowIterator == null) ? 
false : this.rowIterator.hasNext(); } + @Override public Request next() { final Row.Entry entry = this.rowIterator.next(); try { @@ -693,6 +567,7 @@ public class Balancer { } } + @Override public void remove() { if (this.rowIterator != null) this.rowIterator.remove(); } diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 1a7208212..bb441902c 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -215,7 +215,7 @@ public class CrawlQueues { } public int coreCrawlJobSize() { - return this.noticeURL.stackSize(NoticedURL.StackType.CORE) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD); + return this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD); } public boolean coreCrawlJob() { @@ -226,14 +226,14 @@ public class CrawlQueues { // move some tasks to the core crawl job so we have something to do final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance for (int i = 0; i < toshift; i++) { - this.noticeURL.shift(NoticedURL.StackType.LIMIT, NoticedURL.StackType.CORE, this.sb.crawler); + this.noticeURL.shift(NoticedURL.StackType.GLOBAL, NoticedURL.StackType.LOCAL, this.sb.crawler); } this.log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() + ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + this.sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "") + ", robinsonMode=" + ((this.sb.isRobinsonMode()) ? "on" : "off")); } - final String queueCheckCore = loadIsPossible(NoticedURL.StackType.CORE); + final String queueCheckCore = loadIsPossible(NoticedURL.StackType.LOCAL); final String queueCheckNoload = loadIsPossible(NoticedURL.StackType.NOLOAD); if (queueCheckCore != null && queueCheckNoload != null) { if (this.log.isFine()) { @@ -251,11 +251,11 @@ public class CrawlQueues { // do a local crawl Request urlEntry; - while (this.noticeURL.stackSize(NoticedURL.StackType.CORE) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { + while (this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { final String stats = "LOCALCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " + - this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + - this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]"; try { @@ -284,7 +284,7 @@ public class CrawlQueues { return true; } - urlEntry = this.noticeURL.pop(NoticedURL.StackType.CORE, true, this.sb.crawler); + urlEntry = this.noticeURL.pop(NoticedURL.StackType.LOCAL, true, this.sb.crawler); if (urlEntry == null) { continue; } @@ -300,7 +300,7 @@ public class CrawlQueues { } catch (final IOException e) { this.log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e); if (e.getMessage().indexOf("hash is null",0) > 0) { - this.noticeURL.clear(NoticedURL.StackType.CORE); + this.noticeURL.clear(NoticedURL.StackType.LOCAL); } } } @@ -547,7 +547,7 @@ public class CrawlQueues { } public int limitCrawlJobSize() { - return this.noticeURL.stackSize(NoticedURL.StackType.LIMIT); + return 
this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL); } public int noloadCrawlJobSize() { @@ -579,7 +579,7 @@ public class CrawlQueues { } // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view) - final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", " + final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]"; try { final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index d0d29a34f..9a0c6c557 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -370,14 +370,14 @@ public final class CrawlStacker { // it may be possible that global == true and local == true, so do not check an error case against it if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle()); if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle()); - warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry); + warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry); } else if (local) { if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle()); if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle()); - warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry); + warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry); } else if (proxy) { if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle()); - warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry); + warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry); } else if (remote) { warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry); } diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java index 3c6dca7da..85644f80d 100644 --- a/source/de/anomic/crawler/Latency.java +++ b/source/de/anomic/crawler/Latency.java @@ -146,7 +146,7 @@ public class Latency { // return time that is remaining //System.out.println("Latency: " + (waiting - timeSinceLastAccess)); - return waiting - timeSinceLastAccess; + return Math.max(0, waiting - timeSinceLastAccess); } /** diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 93220c5b0..fa5df6683 
100644
--- a/source/de/anomic/crawler/NoticedURL.java
+++ b/source/de/anomic/crawler/NoticedURL.java
@@ -44,7 +44,7 @@ import de.anomic.crawler.retrieval.Request;
 public class NoticedURL {
     public enum StackType {
-        NULL, CORE, LIMIT, OVERHANG, REMOTE, NOLOAD, IMAGE, MOVIE, MUSIC;
+        LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
     }
     public static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
@@ -146,8 +146,8 @@
     public int stackSize(final StackType stackType) {
         switch (stackType) {
             case NOLOAD: return (this.noloadStack == null) ? 0 : this.noloadStack.size();
-            case CORE: return (this.coreStack == null) ? 0 : this.coreStack.size();
-            case LIMIT: return (this.limitStack == null) ? 0 : this.limitStack.size();
+            case LOCAL: return (this.coreStack == null) ? 0 : this.coreStack.size();
+            case GLOBAL: return (this.limitStack == null) ? 0 : this.limitStack.size();
             case OVERHANG: return 0;
             case REMOTE: return (this.remoteStack == null) ? 0 : this.remoteStack.size();
             default: return -1;
@@ -172,9 +172,9 @@
     public String push(final StackType stackType, final Request entry) {
         try {
             switch (stackType) {
-                case CORE:
+                case LOCAL:
                     return this.coreStack.push(entry);
-                case LIMIT:
+                case GLOBAL:
                     return this.limitStack.push(entry);
                 case REMOTE:
                     return this.remoteStack.push(entry);
@@ -233,30 +233,30 @@
      * get a list of domains that are currently maintained as domain stacks
      * @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
      */
-    public Map getDomainStackHosts(final StackType stackType) {
+    public Map<String, Integer[]> getDomainStackHosts(final StackType stackType) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainStackHosts();
-            case LIMIT: return this.limitStack.getDomainStackHosts();
+            case LOCAL: return this.coreStack.getDomainStackHosts();
+            case GLOBAL: return this.limitStack.getDomainStackHosts();
             case REMOTE: return this.remoteStack.getDomainStackHosts();
             case NOLOAD: return this.noloadStack.getDomainStackHosts();
             default: return null;
         }
     }
-    
+
     /**
      * get the sleep time that must pass before the domain of the given crawl entry may be accessed again
      * @return the sleep time in milliseconds
      */
     public long getDomainSleepTime(final StackType stackType, final CrawlSwitchboard cs, Request crawlEntry) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
-            case LIMIT: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
+            case LOCAL: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
+            case GLOBAL: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
             case REMOTE: return this.remoteStack.getDomainSleepTime(cs, crawlEntry);
             case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, crawlEntry);
             default: return 0;
         }
     }
-    
+
     /**
      * get lists of crawl request entries for a specific host
      * @param host
     *
@@ -265,28 +265,18 @@
      */
     public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainStackReferences(host, maxcount);
-            case LIMIT: return this.limitStack.getDomainStackReferences(host, maxcount);
+            case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount);
+            case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount);
             case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
             case NOLOAD: return
this.noloadStack.getDomainStackReferences(host, maxcount); default: return null; } } - - public List top(final StackType stackType, final int count) { - switch (stackType) { - case CORE: return top(this.coreStack, count); - case LIMIT: return top(this.limitStack, count); - case REMOTE: return top(this.remoteStack, count); - case NOLOAD: return top(this.noloadStack, count); - default: return null; - } - } public Request pop(final StackType stackType, final boolean delay, final CrawlSwitchboard cs) throws IOException { switch (stackType) { - case CORE: return pop(this.coreStack, delay, cs); - case LIMIT: return pop(this.limitStack, delay, cs); + case LOCAL: return pop(this.coreStack, delay, cs); + case GLOBAL: return pop(this.limitStack, delay, cs); case REMOTE: return pop(this.remoteStack, delay, cs); case NOLOAD: return pop(this.noloadStack, false, cs); default: return null; @@ -310,8 +300,8 @@ public class NoticedURL { public void clear(final StackType stackType) { Log.logInfo("NoticedURL", "CLEARING STACK " + stackType); switch (stackType) { - case CORE: this.coreStack.clear(); break; - case LIMIT: this.limitStack.clear(); break; + case LOCAL: this.coreStack.clear(); break; + case GLOBAL: this.limitStack.clear(); break; case REMOTE: this.remoteStack.clear(); break; case NOLOAD: this.noloadStack.clear(); break; default: return; @@ -340,17 +330,11 @@ public class NoticedURL { return null; } - private static List top(final Balancer balancer, int count) { - // this is a filo - top - if (count > balancer.size()) count = balancer.size(); - return balancer.top(count); - } - public Iterator iterator(final StackType stackType) { // returns an iterator of plasmaCrawlBalancerEntry Objects try {switch (stackType) { - case CORE: return this.coreStack.iterator(); - case LIMIT: return this.limitStack.iterator(); + case LOCAL: return this.coreStack.iterator(); + case GLOBAL: return this.limitStack.iterator(); case REMOTE: return this.remoteStack.iterator(); case NOLOAD: return this.noloadStack.iterator(); default: return null; diff --git a/source/de/anomic/data/BookmarkHelper.java b/source/de/anomic/data/BookmarkHelper.java index 6119978d2..3fa6d1fcd 100644 --- a/source/de/anomic/data/BookmarkHelper.java +++ b/source/de/anomic/data/BookmarkHelper.java @@ -143,7 +143,7 @@ public class BookmarkHelper { //load the links final ContentScraper scraper = new ContentScraper(baseURL); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); - final Writer writer= new TransformerWriter(null,null,scraper, null, false); + final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(input,writer); writer.close(); links = scraper.getAnchors(); diff --git a/source/de/anomic/http/client/Cache.java b/source/de/anomic/http/client/Cache.java index b886d1924..f89f1d10c 100644 --- a/source/de/anomic/http/client/Cache.java +++ b/source/de/anomic/http/client/Cache.java @@ -40,6 +40,7 @@ import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; +import net.yacy.cora.document.ASCII; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.kelondro.blob.ArrayStack; import net.yacy.kelondro.blob.Compressor; @@ -172,26 +173,30 @@ public final class Cache { * @return true if the content of the url is in the cache, false otherwise */ public static boolean has(final DigestURI url) { + return has(url.hash()); + } + + public static boolean has(final byte[] urlhash) { boolean headerExists; boolean fileExists; //synchronized (responseHeaderDB) { - 
headerExists = responseHeaderDB.containsKey(url.hash()); - fileExists = fileDB.containsKey(url.hash()); + headerExists = responseHeaderDB.containsKey(urlhash); + fileExists = fileDB.containsKey(urlhash); //} if (headerExists && fileExists) return true; if (!headerExists && !fileExists) return false; // if not both is there then we do a clean-up if (headerExists) try { - log.logWarning("header but not content of url " + url.toString() + " in cache; cleaned up"); + log.logWarning("header but not content of urlhash " + ASCII.String(urlhash) + " in cache; cleaned up"); if (responseHeaderDB instanceof MapHeap) { - ((MapHeap) responseHeaderDB).delete(url.hash()); + ((MapHeap) responseHeaderDB).delete(urlhash); } else { - responseHeaderDB.remove(url.hash()); + responseHeaderDB.remove(urlhash); } } catch (final IOException e) {} if (fileExists) try { //log.logWarning("content but not header of url " + url.toString() + " in cache; cleaned up"); - fileDB.delete(url.hash()); + fileDB.delete(urlhash); } catch (final IOException e) {} return false; } diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index 0ece1e9b0..5ec61e83b 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -1039,18 +1039,18 @@ public final class HTTPDFileHandler { if (mimeType.startsWith("text")) { // every text-file distributed by yacy is UTF-8 - if(!path.startsWith("/repository")) { + if (!path.startsWith("/repository")) { mimeType = mimeType + "; charset=UTF-8"; } else { // detect charset of html-files - if((path.endsWith("html") || path.endsWith("htm"))) { + if ((path.endsWith("html") || path.endsWith("htm"))) { // save position fis.mark(1000); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new DigestURI("http://localhost"),null,false); + final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false); final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); - if(charset != null) - mimeType = mimeType + "; charset="+charset; + htmlFilter.close(); + if (charset != null) mimeType = mimeType + "; charset="+charset; // reset position fis.reset(); } diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 62ee4f994..58e3ef437 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -560,7 +560,12 @@ public class MultiProtocolURI implements Serializable, Comparable= 0 && ((ip6 = this.host.indexOf("]", ip6)) > 0)) { + pss = ip6 + 1; + } + final int r = this.host.indexOf(":", pss); if (r < 0) { this.port = dflt; } else { @@ -1164,13 +1169,14 @@ public class MultiProtocolURI implements Serializable, Comparable/ may have many '/' if the host is omitted and the path starts with '/' @@ -1221,9 +1227,9 @@ public class MultiProtocolURI implements Serializable, Comparable alllinks = yacydoc.getAnchors(); int c = 0; if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, yacydoc.inboundLinkCount()); - if (isEmpty() || contains(Field.inboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.inboundlinksnoindexcount_i, yacydoc.inboundLinkNoindexCount()); + if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, 
yacydoc.inboundLinkNofollowCount()); final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()]; @@ -325,7 +326,7 @@ public class SolrScheme extends ConfigurationSet { c++; } if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag); - if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, inboundlinksURLProtocol); + if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol)); if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub); if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName); if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel); @@ -334,7 +335,7 @@ public class SolrScheme extends ConfigurationSet { c = 0; if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, yacydoc.outboundLinkCount()); - if (isEmpty() || contains(Field.outboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.outboundlinksnoindexcount_i, yacydoc.outboundLinkNoindexCount()); + if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()]; @@ -362,7 +363,7 @@ public class SolrScheme extends ConfigurationSet { c++; } if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag); - if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, outboundlinksURLProtocol); + if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol)); if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub); if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName); if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel); @@ -476,7 +477,7 @@ public class SolrScheme extends ConfigurationSet { } addSolr(solrdoc, Field.imagescount_i, imgtags.length); if (isEmpty() || contains(Field.images_tag_txt.name())) addSolr(solrdoc, Field.images_tag_txt, imgtags); - if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, imgprots); + if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, protocolList2indexedList(imgprots)); if (isEmpty() || contains(Field.images_urlstub_txt.name())) addSolr(solrdoc, Field.images_urlstub_txt, imgstubs); if (isEmpty() || contains(Field.images_alt_txt.name())) 
addSolr(solrdoc, Field.images_alt_txt, imgalts); @@ -556,6 +557,18 @@ public class SolrScheme extends ConfigurationSet { return solrdoc; } + private static String[] protocolList2indexedList(String[] protocol) { + List a = new ArrayList(); + for (int i = 0; i < protocol.length; i++) { + if (!protocol[i].equals("http")) { + String c = Integer.toString(i); + while (c.length() < 3) c = "0" + c; + a.add(c + "-" + protocol[i]); + } + } + return a.toArray(new String[a.size()]); + } + /** * encode a string containing attributes from anchor rel properties binary: * bit 0: "me" contained in rel @@ -615,7 +628,7 @@ public class SolrScheme extends ConfigurationSet { } /* - * standard solr scheme + standard solr schema @@ -641,6 +654,5 @@ public class SolrScheme extends ConfigurationSet { - */ } diff --git a/source/net/yacy/cora/storage/ConfigurationSet.java b/source/net/yacy/cora/storage/ConfigurationSet.java index 6ac64ff30..de025bfea 100644 --- a/source/net/yacy/cora/storage/ConfigurationSet.java +++ b/source/net/yacy/cora/storage/ConfigurationSet.java @@ -121,13 +121,13 @@ public class ConfigurationSet extends AbstractSet implements Set return false; } - public void fill(final ConfigurationSet other) { + public void fill(final ConfigurationSet other, final boolean defaultActivated) { final Iterator i = other.allIterator(); Entry e; while (i.hasNext()) { e = i.next(); if (contains(e.key) || containsDisabled(e.key)) continue; - this.add(e.key(), other.commentHeadline(e.key()), e.enabled()); + this.add(e.key(), other.commentHeadline(e.key()), defaultActivated && e.enabled()); } } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index dfa8802db..cec3c7858 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -634,22 +634,22 @@ dc_rights return (this.outboundlinks == null) ? 
0 : this.outboundlinks.size(); } - public int inboundLinkNoindexCount() { + public int inboundLinkNofollowCount() { if (this.inboundlinks == null) resortLinks(); if (this.inboundlinks == null) return 0; int c = 0; for (final String tag: this.inboundlinks.values()) { - if (tag.contains("noindex")) c++; + if (tag.contains("nofollow")) c++; } return c; } - public int outboundLinkNoindexCount() { + public int outboundLinkNofollowCount() { if (this.outboundlinks == null) resortLinks(); if (this.outboundlinks == null) return 0; int c = 0; for (final String tag: this.outboundlinks.values()) { - if (tag.contains("noindex")) c++; + if (tag.contains("nofollow")) c++; } return c; } diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 850a7a433..919d74417 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -472,6 +472,7 @@ public final class TextParser { } public static void grantExtension(final String ext, final boolean grant) { + if (ext == null || ext.length() == 0) return; if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v); } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 07b22f6ab..b6ca55ff9 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -131,6 +131,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private float lon, lat; private MultiProtocolURI canonical; + /** * {@link MultiProtocolURI} to the favicon that belongs to the document */ @@ -151,6 +152,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // the root value here will not be used to load the resource. 
// it is only the reference for relative links super(linkTags0, linkTags1); + assert root != null; this.root = root; this.evaluationScores = new Evaluation(); this.rss = new HashMap(); @@ -175,6 +177,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.canonical = null; } + @Override + public void finish() { + this.content.trimToSize(); + } + private void mergeAnchors(final MultiProtocolURI url, final Properties p) { final Properties p0 = this.anchors.get(url); if (p0 == null) { @@ -485,17 +492,23 @@ public class ContentScraper extends AbstractScraper implements Scraper { final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); try { FileUtils.copy(new CharArrayReader(inlineHtml), writer); - writer.close(); } catch (final IOException e) { Log.logException(e); return cleanLine(super.stripAll(inlineHtml)); + } finally { + try { + writer.close(); + } catch (IOException e) { + } } for (final Map.Entry entry: scraper.getAnchors().entrySet()) { mergeAnchors(entry.getKey(), entry.getValue()); } this.images.putAll(scraper.images); - return cleanLine(super.stripAll(scraper.content.getChars())); + String line = cleanLine(super.stripAll(scraper.content.getChars())); + scraper.close(); + return line; } private final static String cleanLine(final String s) { @@ -885,14 +898,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { // scrape document to look up charset final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); - if(charset == null) - charset = Charset.defaultCharset().toString(); + htmlFilter.close(); + if (charset == null) charset = Charset.defaultCharset().toString(); // scrape content final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost")); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); - + writer.close(); return scraper; } diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java index c6d97bea4..ce4679676 100644 --- a/source/net/yacy/document/parser/html/ContentTransformer.java +++ b/source/net/yacy/document/parser/html/ContentTransformer.java @@ -34,7 +34,6 @@ import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.kelondro.io.CharBuffer; -import net.yacy.kelondro.logging.Log; public class ContentTransformer extends AbstractTransformer implements Transformer { @@ -90,11 +89,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform } bb.append(" "); final char[] result = bb.getChars(); - try { - bb.close(); - } catch (IOException e) { - Log.logException(e); - } + bb.close(); return result; } diff --git a/source/net/yacy/document/parser/html/Scraper.java b/source/net/yacy/document/parser/html/Scraper.java index ccc7de263..e1dfe73e1 100644 --- a/source/net/yacy/document/parser/html/Scraper.java +++ b/source/net/yacy/document/parser/html/Scraper.java @@ -1,4 +1,4 @@ -// Scraper.java +// Scraper.java // --------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de @@ -39,10 +39,12 @@ public interface Scraper { public void scrapeTag1(String tagname, Properties tagopts, char[] text); public void 
scrapeComment(final char[] comment); - + + public void finish(); + public void close(); - + public void registerHtmlFilterEventListener(ScraperListener listener); - + public void deregisterHtmlFilterEventListener(ScraperListener listener); } diff --git a/source/net/yacy/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java index 6cdc6086a..8c3fa454d 100644 --- a/source/net/yacy/document/parser/html/ScraperInputStream.java +++ b/source/net/yacy/document/parser/html/ScraperInputStream.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -39,11 +39,11 @@ import net.yacy.cora.document.MultiProtocolURI; public class ScraperInputStream extends InputStream implements ScraperListener { - + private static final int MODE_PRESCAN = 0; private static final int MODE_PRESCAN_FINISHED = 1; private int mode = 1; - + private static final long preBufferSize = 4096; private long preRead = 0; private final BufferedInputStream bufferedIn; @@ -51,10 +51,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener { private String detectedCharset; private boolean charsetChanged = false; private boolean endOfHead = false; - + private Reader reader; private Writer writer; - + public ScraperInputStream( final InputStream inStream, final String inputStreamCharset, @@ -65,10 +65,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener { // create a input stream for buffereing this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize); this.bufferedIn.mark((int) preBufferSize); - + final ContentScraper scraper = new ContentScraper(rooturl); scraper.registerHtmlFilterEventListener(this); - + try { this.reader = (inputStreamCharset == null) ? new InputStreamReader(this) : new InputStreamReader(this,inputStreamCharset); } catch (UnsupportedEncodingException e) { @@ -78,17 +78,17 @@ public class ScraperInputStream extends InputStream implements ScraperListener { // how is that possible? 
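+                // as a fallback, read the stream with the platform default charset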
this.reader = new InputStreamReader(this); } - } + } this.writer = new TransformerWriter(null,null,scraper,transformer,passbyIfBinarySuspect); } private static String extractCharsetFromMimetypeHeader(final String mimeType) { if (mimeType == null) return null; - + final String[] parts = mimeType.split(";"); if (parts == null || parts.length <= 1) return null; - - for (int i=1; i < parts.length; i++) { + + for (int i=1; i < parts.length; i++) { final String param = parts[i].trim(); if (param.startsWith("charset=")) { String charset = param.substring("charset=".length()).trim(); @@ -97,13 +97,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener { return charset.trim(); } } - - return null; + + return null; } + @Override public void scrapeTag0(final String tagname, final Properties tagopts) { if (tagname == null || tagname.length() == 0) return; - + if (tagname.equalsIgnoreCase("meta")) { if (tagopts.containsKey("http-equiv")) { final String value = tagopts.getProperty("http-equiv"); @@ -113,7 +114,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener { this.detectedCharset = extractCharsetFromMimetypeHeader(contentType); if (this.detectedCharset != null && this.detectedCharset.length() > 0) { this.charsetChanged = true; - } else if (tagopts.containsKey("charset")) { + } else if (tagopts.containsKey("charset")) { // sometimes the charset property is configured as extra attribut. try it ... this.detectedCharset = tagopts.getProperty("charset"); this.charsetChanged = true; @@ -123,48 +124,54 @@ public class ScraperInputStream extends InputStream implements ScraperListener { } } + @Override public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) { if (tagname == null || tagname.length() == 0) return; - + if (tagname.equalsIgnoreCase("head")) { this.endOfHead = true; } } - + public String detectCharset() throws IOException { - this.mode = MODE_PRESCAN; - + this.mode = MODE_PRESCAN; + // loop until we have detected the header element or the charset data int c; while ((c = this.reader.read())!= -1) { this.writer.write(c); if (this.charsetChanged) break; // thats enough } - + // free writer - this.writer = null; - // don't close writer here, otherwise it will shutdown our source stream + this.writer = null; + // don't close writer here, otherwise it will shutdown our source stream // reset the buffer if not already done if (this.mode != MODE_PRESCAN_FINISHED) { this.mode++; this.bufferedIn.reset(); } - + // return scanning result return (this.charsetChanged) ? 
this.detectedCharset : null; } + @Override public int read() throws IOException { // mode 0 is called from within the detectCharset function - if (this.mode == MODE_PRESCAN) { + if (this.mode == MODE_PRESCAN) { if (this.endOfHead || this.charsetChanged || this.preRead >= preBufferSize - 1) { - return -1; + return -1; } - this.preRead++; - } + this.preRead++; + } return this.bufferedIn.read(); } - + @Override + public void close() throws IOException { + if (this.writer != null) this.writer.close(); + } + } diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index e6dfe9c75..77a6b0bd6 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -127,11 +127,7 @@ public final class TransformerWriter extends Writer { } bb.append('>'); final char[] result = bb.getChars(); - try { - bb.close(); - } catch (final IOException e) { - Log.logException(e); - } + bb.close(); return result; } @@ -147,11 +143,7 @@ public final class TransformerWriter extends Writer { bb.append(text); bb.append('<').append('/').append(tagname).append('>'); final char[] result = bb.getChars(); - try { - bb.close(); - } catch (final IOException e) { - Log.logException(e); - } + bb.close(); return result; } @@ -165,11 +157,7 @@ public final class TransformerWriter extends Writer { } bb.append('>'); final char[] result = bb.getChars(); - try { - bb.close(); - } catch (final IOException e) { - Log.logException(e); - } + bb.close(); return result; } @@ -178,11 +166,7 @@ public final class TransformerWriter extends Writer { final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3); cb.append(text).append('<').append('/').append(tagname).append('>'); final char[] result = cb.getChars(); - try { - cb.close(); - } catch (final IOException e) { - Log.logException(e); - } + cb.close(); return result; } @@ -202,11 +186,7 @@ public final class TransformerWriter extends Writer { result = bb.getChars(1); else result = bb.getChars(); - try { - bb.close(); - } catch (final IOException ex) { - Log.logException(ex); - } + bb.close(); return result; } @@ -227,12 +207,7 @@ public final class TransformerWriter extends Writer { // this single tag is collected at once here final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); this.scraper.scrapeTag0(tag, charBuffer.propParser()); - try { - charBuffer.close(); - } catch (final IOException e) { - // TODO Auto-generated catch block - Log.logException(e); - } + charBuffer.close(); } if ((this.transformer != null) && (this.transformer.isTag0(tag))) { // this single tag is collected at once here @@ -240,11 +215,7 @@ public final class TransformerWriter extends Writer { try { return this.transformer.transformTag0(tag, scb.propParser(), quotechar); } finally { - try { - scb.close(); - } catch (final IOException e) { - Log.logException(e); - } + scb.close(); } } else if (((this.scraper != null) && (this.scraper.isTag1(tag))) || ((this.transformer != null) && (this.transformer.isTag1(tag)))) { @@ -252,11 +223,7 @@ public final class TransformerWriter extends Writer { this.filterTag = tag; final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); this.filterOpts = scb.propParser(); - try { - scb.close(); - } catch (final IOException e) { - Log.logException(e); - } + scb.close(); if (this.filterCont == null) this.filterCont = new 
CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset(); return new char[0]; } else { @@ -543,6 +510,7 @@ public final class TransformerWriter extends Writer { // the filter process is messed up // instead, we simply flush the underlying output stream if (this.out != null) this.out.flush(); + if (this.scraper != null) this.scraper.finish(); // if you want to flush all, call close() at end of writing; } @@ -567,8 +535,7 @@ public final class TransformerWriter extends Writer { this.filterOpts = null; if (this.filterCont != null) this.filterCont.close(); this.filterCont = null; -// if (scraper != null) {scraper.close(); scraper = null;} -// if (transformer != null) {transformer.close(); transformer = null;} + if (this.scraper != null) this.scraper.finish(); } private static boolean binaryHint(final char c) { diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 117cd884e..033ced94e 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -203,8 +203,9 @@ public class htmlParser extends AbstractParser implements Parser { } catch (final IOException e) { throw new Parser.Failure("IO error:" + e.getMessage(), location); } finally { + writer.flush(); sourceStream.close(); - writer.close(); + //writer.close(); } //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); //serverFileUtils.copy(sourceFile, hfos); diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index e8b01edc3..3d5f93410 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -144,14 +144,13 @@ public class pdfParser extends AbstractParser implements Parser { try { writer.append(stripper.getText(pdfDoc)); } catch (final Throwable e) {} - } - }; + } + }; t.start(); t.join(3000); if (t.isAlive()) t.interrupt(); pdfDoc.close(); - contentBytes = writer.getBytes(); // get final text before closing writer - writer.close(); + contentBytes = writer.getBytes(); // get final text before closing writer } catch (final IOException e) { // close the writer if (writer != null) try { writer.close(); } catch (final Exception ex) {} @@ -166,6 +165,7 @@ public class pdfParser extends AbstractParser implements Parser { //throw new Parser.Failure(e.getMessage(), location); } finally { try {pdfDoc.close();} catch (final IOException e) {} + writer.close(); } String[] docKeywords = null; @@ -175,7 +175,7 @@ public class pdfParser extends AbstractParser implements Parser { if (docTitle == null) { docTitle = docSubject; } - + // clear resources in pdfbox. they say that is resolved but it's not. 
see: // https://issues.apache.org/jira/browse/PDFBOX-313 // https://issues.apache.org/jira/browse/PDFBOX-351 diff --git a/source/net/yacy/kelondro/blob/MapColumnIndex.java b/source/net/yacy/kelondro/blob/MapColumnIndex.java new file mode 100644 index 000000000..fc7a9dcb2 --- /dev/null +++ b/source/net/yacy/kelondro/blob/MapColumnIndex.java @@ -0,0 +1,174 @@ +/** + * MapColumnIndex + * Copyright 2012 by Michael Christen + * First released 01.02.2012 at http://yacy.net + * + * $LastChangedDate$ + * $LastChangedRevision$ + * $LastChangedBy$ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. + */ + + +package net.yacy.kelondro.blob; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +import net.yacy.cora.document.ASCII; +import net.yacy.kelondro.order.NaturalOrder; + +/** + * a mapping from a column name to maps with the value of the columns to the primary keys where the entry exists in the table + */ +public class MapColumnIndex { + + private static final long serialVersionUID=-424741536889467566L; + + private final Map<String, Map<String, Collection<byte[]>>> index; + + public MapColumnIndex() { + this.index = new HashMap<String, Map<String, Collection<byte[]>>>(); + } + + public synchronized Collection<byte[]> getIndex(final String whereKey, final String isValue) throws UnsupportedOperationException { + Map<String, Collection<byte[]>> references = this.index.get(whereKey); + if (references == null) throw new UnsupportedOperationException(); + Collection<byte[]> indexes = references.get(isValue); + if (indexes == null) return new ArrayList<byte[]>(0); // empty collection + return indexes; + } + + public synchronized void clear() { + this.index.clear(); + } + + /** + * create a full index for the whereKey + * @param whereKey + * @param isValue + * @param table + */ + public synchronized void init(final String whereKey, final String isValue, final Iterator<Map.Entry<byte[], Map<String, String>>> table) { + Map<String, Collection<byte[]>> valueIdxMap = new HashMap<String, Collection<byte[]>>(); + this.index.put(whereKey, valueIdxMap); + Map.Entry<byte[], Map<String, String>> line; + while (table.hasNext()) { + line = table.next(); + String value = line.getValue().get(whereKey); + if (value == null) continue; // we don't need to remember that + indexupdate(line.getKey(), valueIdxMap, value); + } + } + + /** + * update an index entry + * @param primarykey the primary key for the row that is updated + * @param row the row that was updated (a mapping from column names to values) + */ + public synchronized void update(final byte[] primarykey, final Map<String, String> row) { + for (Map.Entry<String, Map<String, Collection<byte[]>>> entry: this.index.entrySet()) { + // create an index for all columns that we track + String value = row.get(entry.getKey()); + if (value == null) continue; // we don't need to remember that + indexupdate(primarykey, entry.getValue(), value); + } + } + + private void indexupdate(final byte[] primarykey, final Map<String, Collection<byte[]>> valueIdxMap, final String value) { + Collection<byte[]> indexes = valueIdxMap.get(value); + if (indexes == null) { + // create a new index entry + indexes = new ArrayList<byte[]>(1); + indexes.add(primarykey); + valueIdxMap.put(value, indexes); + } else { + // update the existing index entry + // check if the value already exists + if (!net.yacy.kelondro.util.ByteBuffer.contains(indexes, primarykey)) { + indexes.add(primarykey); + } + } + } + + /** + * delete all references to the primary key + * @param primarykey + */ + public synchronized void delete(final byte[] primarykey) { + for (Map.Entry<String, Map<String, Collection<byte[]>>> entry: this.index.entrySet()) { + // we must check all index reference maps: iterate over entries + indexdelete(primarykey, entry.getValue()); + } + } + + private void indexdelete(final byte[] index, final Map<String, Collection<byte[]>> valueIdxMap) { + Iterator<Map.Entry<String, Collection<byte[]>>> i = valueIdxMap.entrySet().iterator(); + Map.Entry<String, Collection<byte[]>> ref; + while (i.hasNext()) { + ref = i.next(); + net.yacy.kelondro.util.ByteBuffer.remove(ref.getValue(), index); + if (ref.getValue().isEmpty()) { + i.remove(); + } + } + } + + private static Collection<byte[]> getIndexWithExceptionHandler(final MapColumnIndex idx, final String whereKey, final String isValue, Map<byte[], Map<String, String>> table) { + try { + return idx.getIndex(whereKey, isValue); + } catch (UnsupportedOperationException e) { + idx.init(whereKey, isValue, table.entrySet().iterator()); + try { + return idx.getIndex(whereKey, isValue); + } catch (UnsupportedOperationException ee) { + throw ee; + } + } + } + + private static void printIndex(Collection<byte[]> index) { + System.out.print("idx{"); + int c = 0; + for (byte[] a: index) { + if (c++ != 0) System.out.print(", "); + System.out.print(ASCII.String(a)); + } + System.out.print("}"); + } + + public static void main(String[] args) { + Map<byte[], Map<String, String>> table = new TreeMap<byte[], Map<String, String>>(NaturalOrder.naturalOrder); + Map<String, String> row; + row = new HashMap<String, String>(); row.put("a", "1"); row.put("b", "2"); row.put("c", "2"); table.put("line1".getBytes(), row); + row = new HashMap<String, String>(); row.put("a", "3"); row.put("b", "2"); row.put("c", "4"); table.put("line2".getBytes(), row); + row = new HashMap<String, String>(); row.put("a", "5"); row.put("b", "2"); row.put("c", "4"); table.put("line3".getBytes(), row); + row = new HashMap<String, String>(); row.put("a", "6"); row.put("b", "7"); row.put("c", "8"); table.put("line4".getBytes(), row); + MapColumnIndex idx = new MapColumnIndex(); + System.out.print("column b, value 2: "); printIndex(getIndexWithExceptionHandler(idx, "b", "2", table)); System.out.println(); + System.out.print("column c, value 4: "); printIndex(getIndexWithExceptionHandler(idx, "c", "4", table)); System.out.println(); + System.out.print("column b, value 7: "); printIndex(getIndexWithExceptionHandler(idx, "b", "7", table)); System.out.println(); + System.out.print("column d, value 0: "); printIndex(getIndexWithExceptionHandler(idx, "d", "0", table)); System.out.println(); + row = new HashMap<String, String>(); row.put("a", "9"); row.put("b", "9"); row.put("c", "4"); table.put("line5".getBytes(), row); + idx.update("line5".getBytes(), row); + System.out.print("column c, value 4: "); printIndex(getIndexWithExceptionHandler(idx, "c", "4", table)); System.out.println(); + } + +}
diff --git a/source/net/yacy/kelondro/blob/MapDataMining.java b/source/net/yacy/kelondro/blob/MapDataMining.java index 1fc87c0ef..ef36e46d4 100644 --- a/source/net/yacy/kelondro/blob/MapDataMining.java +++ b/source/net/yacy/kelondro/blob/MapDataMining.java @@ -29,6 +29,7 @@ package net.yacy.kelondro.blob; import java.io.File; import java.io.IOException; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -55,6 +56,7 @@ public class MapDataMining extends MapHeap { private Map> sortClusterMap;
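A note on the ByteBuffer.contains call in indexupdate above (and ByteBuffer.remove in indexdelete): byte[] has identity semantics in Java, so a plain Collection.contains would compare references and never find an equal key. A membership test over byte[] keys must compare content; a minimal hypothetical stand-in:

import java.util.Arrays;
import java.util.Collection;

// hypothetical sketch of what net.yacy.kelondro.util.ByteBuffer.contains must do
static boolean containsKey(final Collection<byte[]> keys, final byte[] key) {
    for (final byte[] k : keys) {
        if (Arrays.equals(k, key)) return true; // compare content, not references
    }
    return false;
}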
// a String-kelondroMScoreCluster - relation private Map accLong; // to store accumulations of Long cells private Map accFloat; // to store accumulations of Float cells + private final MapColumnIndex columnIndex; // to store fast select-where indexes @SuppressWarnings("unchecked") public MapDataMining(final File heapFile, @@ -73,6 +75,8 @@ public class MapDataMining extends MapHeap { this.longaccfields = longaccfields; this.floataccfields = floataccfields; + this.columnIndex = new MapColumnIndex(); + ScoreMap[] cluster = null; if (sortfields == null) this.sortClusterMap = null; else { this.sortClusterMap = new ConcurrentHashMap>(); @@ -192,6 +196,8 @@ public class MapDataMining extends MapHeap { this.accFloat.put(floataccfield, FLOAT0); } } + + this.columnIndex.clear(); } @Override @@ -216,6 +222,8 @@ public class MapDataMining extends MapHeap { // update sortCluster if (this.sortClusterMap != null) updateSortCluster(UTF8.String(key), newMap); + + this.columnIndex.update(key, newMap); } private void updateAcc(final Map map, final boolean add) { @@ -294,6 +302,8 @@ public class MapDataMining extends MapHeap { } } super.delete(key); + + this.columnIndex.delete(key); } private void deleteSortCluster(final String key) { @@ -315,6 +325,10 @@ public class MapDataMining extends MapHeap { return new string2bytearrayIterator(cluster.keys(up)); } + private synchronized Iterator keys() throws IOException { + return super.keys(true, null); + } + private static class string2bytearrayIterator implements Iterator { private final Iterator s; @@ -342,15 +356,25 @@ public class MapDataMining extends MapHeap { } - @Override - public synchronized Iterator>> entries(final String whereKey, final String isValue) throws IOException { - return super.entries(whereKey, isValue); + public synchronized Collection select(final String whereKey, final String isValue) throws IOException { + Collection idx = null; + try { + idx = this.columnIndex.getIndex(whereKey, isValue); + } catch (UnsupportedOperationException e) { + this.columnIndex.init(whereKey, isValue, new FullMapIterator(keys())); + try { + idx = this.columnIndex.getIndex(whereKey, isValue); + } catch (UnsupportedOperationException ee) { + throw ee; + } + } + return idx; } - + public synchronized Iterator>> entries(final boolean up, final String field) { - return new MapIterator(keys(up, field), null, null); + return new FullMapIterator(keys(up, field)); } - + public synchronized long getLongAcc(final String field) { final Long accumulator = this.accLong.get(field); if (accumulator == null) return -1; diff --git a/source/net/yacy/kelondro/blob/MapHeap.java b/source/net/yacy/kelondro/blob/MapHeap.java index 42073f9c7..5d5049f7f 100644 --- a/source/net/yacy/kelondro/blob/MapHeap.java +++ b/source/net/yacy/kelondro/blob/MapHeap.java @@ -82,6 +82,14 @@ public class MapHeap implements Map> { return this.blob.keylength(); } + /** + * get the ordering of the primary keys + * @return + */ + public ByteOrder ordering() { + return this.blob.ordering(); + } + /** * clears the content of the database * @throws IOException @@ -366,6 +374,10 @@ public class MapHeap implements Map> { return new KeyIterator(up, rotating, firstKey, secondKey); } + public synchronized CloneableIterator keys(boolean up, byte[] firstKey) throws IOException { + return this.blob.keys(up, firstKey); + } + public class KeyIterator implements CloneableIterator, Iterator { final boolean up, rotating; @@ -406,17 +418,13 @@ public class MapHeap implements Map> { } } - - public synchronized Iterator>> 
entries(final String whereKey, final String isValue) throws IOException { - return new MapIterator(this.blob.keys(true, null), whereKey, isValue); - } public synchronized Iterator>> entries(final boolean up, final boolean rotating) throws IOException { - return new MapIterator(keys(up, rotating), null, null); + return new FullMapIterator(keys(up, rotating)); } public synchronized Iterator>> entries(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException { - return new MapIterator(keys(up, rotating, firstKey, secondKey), null, null); + return new FullMapIterator(keys(up, rotating, firstKey, secondKey)); } /** @@ -448,18 +456,15 @@ public class MapHeap implements Map> { public void finalize() { close(); } - - public class MapIterator extends LookAheadIterator>> implements Iterator>> { + + protected class FullMapIterator extends LookAheadIterator>> implements Iterator>> { // enumerates Map-Type elements // the key is also included in every map that is returned; it's key is 'key' private final Iterator keyIterator; - private final String whereKey, isValue; - MapIterator(final Iterator keyIterator, final String whereKey, final String isValue) { + FullMapIterator(final Iterator keyIterator) { this.keyIterator = keyIterator; - this.whereKey = whereKey; - this.isValue = isValue; } @Override @@ -479,19 +484,14 @@ public class MapHeap implements Map> { continue; } if (map == null) continue; // circumvention of a modified exception - // check if the where case holds - if (this.whereKey != null && this.isValue != null) { - String v = map.get(this.whereKey); - if (v == null) continue; - if (!v.equals(this.isValue)) continue; - } // produce entry Map.Entry> entry = new AbstractMap.SimpleImmutableEntry>(nextKey, map); return entry; } return null; } - } // class mapIterator + } // class FullMapIterator + @Override public void putAll(final Map> map) { diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index be805dbeb..dbef4fe98 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -189,7 +189,7 @@ public class URIMetadataRow implements URIMetadata { final String dc_publisher, final float lat, final float lon) { - final CharBuffer s = new CharBuffer(20000, 360); + final CharBuffer s = new CharBuffer(3600, 360); s.append(url.toNormalform(false, true)).appendLF(); s.append(dc_title).appendLF(); if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); diff --git a/source/net/yacy/kelondro/data/word/Word.java b/source/net/yacy/kelondro/data/word/Word.java index b4986d0dd..714bbbd6b 100644 --- a/source/net/yacy/kelondro/data/word/Word.java +++ b/source/net/yacy/kelondro/data/word/Word.java @@ -26,6 +26,7 @@ package net.yacy.kelondro.data.word; +import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.Locale; @@ -118,11 +119,11 @@ public class Word { private final static byte lowByte = Base64Order.alpha_enhanced[0]; private final static byte highByte = Base64Order.alpha_enhanced[Base64Order.alpha_enhanced.length - 1]; - + public static boolean isPrivate(byte[] hash) { return hash[0] == highByte && hash[1] == highByte && hash[2] == highByte && hash[3] == highByte && hash[4] == highByte; } - + // create a word hash public static final byte[] word2hash(final String word) { final String wordlc = word.toLowerCase(Locale.ENGLISH); @@ -148,7 +149,7 
@@ public class Word { public final static byte PRIVATE_TYPE_COPY = 'C'; // used for a private local copy of the index public final static byte PRIVATE_TYPE_PHONETIC = 'K'; // used for ColognePhonetics - + public static final byte[] hash2private(final byte[] hash, byte privateType) { byte[] p = new byte[commonHashLength]; p[0] = highByte; p[1] = highByte; p[2] = highByte; ; p[3] = highByte; ; p[4] = highByte; p[5] = privateType; @@ -156,7 +157,7 @@ public class Word { return p; } - public static final HandleSet words2hashesHandles(final Set words) { + public static final HandleSet words2hashesHandles(final Collection words) { final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.size()); for (final String word: words) try { diff --git a/source/net/yacy/kelondro/io/CharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java index 5c6c340ba..4c2e698aa 100644 --- a/source/net/yacy/kelondro/io/CharBuffer.java +++ b/source/net/yacy/kelondro/io/CharBuffer.java @@ -73,27 +73,6 @@ public final class CharBuffer extends Writer { this.maximumLength = maximumLength; } - public CharBuffer(final int maximumLength, final char[] bb, final int of, final int le) { - if (of * 2 > bb.length) { - this.buffer = new char[le]; - System.arraycopy(bb, of, this.buffer, 0, le); - this.length = le; - this.offset = 0; - } else { - this.buffer = bb; - this.length = le; - this.offset = of; - } - this.maximumLength = maximumLength; - } - - public CharBuffer(final CharBuffer bb) { - this.buffer = bb.buffer; - this.length = bb.length; - this.offset = bb.offset; - this.maximumLength = bb.maximumLength; - } - public CharBuffer(final File f) throws IOException { // initially fill the buffer with the content of a file if (f.length() > Integer.MAX_VALUE) throw new IOException("file is too large for buffering"); @@ -130,8 +109,7 @@ public final class CharBuffer extends Writer { } private void grow(int minSize) { - int newsize = this.buffer.length + 1024; - if (newsize < minSize) newsize = minSize+1; + int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // grow by 20% char[] tmp = new char[newsize]; System.arraycopy(this.buffer, this.offset, tmp, 0, this.length); this.buffer = tmp; @@ -187,7 +165,7 @@ public final class CharBuffer extends Writer { } public CharBuffer append(final char[] bb) { - write(bb); + write(bb, 0, bb.length); return this; } @@ -205,14 +183,14 @@ public final class CharBuffer extends Writer { public CharBuffer append(final String s) { final char[] temp = new char[s.length()]; s.getChars(0, temp.length, temp, 0); - write(temp); + write(temp, 0, temp.length); return this; } public CharBuffer append(final String s, final int off, final int len) { final char[] temp = new char[len]; s.getChars(off, (off + len), temp, 0); - write(temp); + write(temp, 0, len); return this; } @@ -479,15 +457,12 @@ public final class CharBuffer extends Writer { this.offset = 0; } - public void reset(final int newSize) { - this.resize(newSize); - this.reset(); - } - - public void resize(final int newSize) { - if(newSize < 0) throw new IllegalArgumentException("Illegal array size: " + newSize); - final char[] v = new char[newSize]; - System.arraycopy(this.buffer,0,v,0,newSize > this.buffer.length ? 
this.buffer.length : newSize); + /** + * call trimToSize() whenever a CharBuffer is not extended any more and is kept to store the content permanently + */ + public void trimToSize() { + final char[] v = new char[this.length]; + System.arraycopy(this.buffer, this.offset, v, 0, this.length); this.buffer = v; } @@ -498,13 +473,15 @@ public final class CharBuffer extends Writer { } @Override - public void close() throws IOException { + public void close() { + this.length = 0; + this.offset = 0; this.buffer = null; // assist with garbage collection } @Override - public void flush() throws IOException { - // TODO Auto-generated method stub + public void flush() { + trimToSize(); } } \ No newline at end of file diff --git a/source/net/yacy/kelondro/table/SplitTable.java b/source/net/yacy/kelondro/table/SplitTable.java index 465ed92e4..c86eccb43 100644 --- a/source/net/yacy/kelondro/table/SplitTable.java +++ b/source/net/yacy/kelondro/table/SplitTable.java @@ -320,7 +320,9 @@ public class SplitTable implements Index, Iterable { public Row.Entry get(final byte[] key, final boolean forcecopy) throws IOException { final Index keeper = keeperOf(key); if (keeper == null) return null; - return keeper.get(key, forcecopy); + synchronized (this) { // avoid concurrent IO from different methods + return keeper.get(key, forcecopy); + } } @Override @@ -376,8 +378,10 @@ public class SplitTable implements Index, Iterable { public Row.Entry replace(final Row.Entry row) throws IOException, RowSpaceExceededException { assert row.objectsize() <= this.rowdef.objectsize; Index keeper = keeperOf(row.getPrimaryKeyBytes()); - if (keeper != null) return keeper.replace(row); - synchronized (this.tables) { + if (keeper != null) synchronized (this) { // avoid concurrent IO from different methods + return keeper.replace(row); + } + synchronized (this) { assert this.current == null || this.tables.get(this.current) != null : "this.current = " + this.current; keeper = (this.current == null) ? newTable() : checkTable(this.tables.get(this.current)); } @@ -397,12 +401,11 @@ public class SplitTable implements Index, Iterable { assert row.objectsize() <= this.rowdef.objectsize; final byte[] key = row.getPrimaryKeyBytes(); if (this.tables == null) return true; - Index keeper = null; - synchronized (this.tables) { - keeper = keeperOf(key); + Index keeper = keeperOf(key); + if (keeper != null) synchronized (this) { // avoid concurrent IO from different methods + return keeper.put(row); } - if (keeper != null) return keeper.put(row); - synchronized (this.tables) { + synchronized (this) { keeper = keeperOf(key); // we must check that again because it could have changed in between if (keeper != null) return keeper.put(row); assert this.current == null || this.tables.get(this.current) != null : "this.current = " + this.current; @@ -425,12 +428,12 @@ public class SplitTable implements Index, Iterable { @Override public void addUnique(final Row.Entry row) throws IOException, RowSpaceExceededException { assert row.objectsize() <= this.rowdef.objectsize; - Index table = (this.current == null) ? null : this.tables.get(this.current); - synchronized (this.tables) { + Index keeper = (this.current == null) ? 
null : this.tables.get(this.current); + synchronized (this) { assert this.current == null || this.tables.get(this.current) != null : "this.current = " + this.current; - if (table == null) table = newTable(); else table = checkTable(table); + if (keeper == null) keeper = newTable(); else keeper = checkTable(keeper); } - table.addUnique(row); + keeper.addUnique(row); } @Override @@ -447,14 +450,18 @@ public class SplitTable implements Index, Iterable { public boolean delete(final byte[] key) throws IOException { final Index table = keeperOf(key); if (table == null) return false; - return table.delete(key); + synchronized (this) { // avoid concurrent IO from different methods + return table.delete(key); + } } @Override public Row.Entry remove(final byte[] key) throws IOException { final Index table = keeperOf(key); if (table == null) return null; - return table.remove(key); + synchronized (this) { // avoid concurrent IO from different methods + return table.remove(key); + } } @Override @@ -472,7 +479,9 @@ public class SplitTable implements Index, Iterable { if (maxtable == null) { return null; } - return maxtable.removeOne(); + synchronized (this) { // avoid concurrent IO from different methods + return maxtable.removeOne(); + } } @Override @@ -490,7 +499,9 @@ public class SplitTable implements Index, Iterable { if (maxtable == null) { return null; } - return maxtable.top(count); + synchronized (this) { // avoid concurrent IO from different methods + return maxtable.top(count); + } } @Override diff --git a/source/net/yacy/kelondro/util/ByteBuffer.java b/source/net/yacy/kelondro/util/ByteBuffer.java index 051467e61..93ef6d7b8 100644 --- a/source/net/yacy/kelondro/util/ByteBuffer.java +++ b/source/net/yacy/kelondro/util/ByteBuffer.java @@ -32,6 +32,7 @@ import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collection; +import java.util.Iterator; import java.util.List; import java.util.Properties; @@ -67,7 +68,7 @@ public final class ByteBuffer extends OutputStream { } public ByteBuffer(final String s) { - this.buffer = s.getBytes(UTF8.charset); + this.buffer = UTF8.getBytes(s); this.length = this.buffer.length; this.offset = 0; } @@ -140,6 +141,7 @@ public final class ByteBuffer extends OutputStream { this.offset = 0; } + @Override public void write(final int b) { write((byte) (b & 0xff)); } @@ -518,6 +520,20 @@ public final class ByteBuffer extends OutputStream { return false; } + public static int remove(final Collection collection, final byte[] key) { + Iterator i = collection.iterator(); + byte[] v; + int c = 0; + while (i.hasNext()) { + v = i.next(); + if (equals(v, key)) { + i.remove(); + c++; + } + } + return c; + } + public static List split(final byte[] b, final byte s) { final ArrayList a = new ArrayList(); int c = 0; diff --git a/source/net/yacy/kelondro/util/SetTools.java b/source/net/yacy/kelondro/util/SetTools.java index a08dc2e0c..6f34a9b74 100644 --- a/source/net/yacy/kelondro/util/SetTools.java +++ b/source/net/yacy/kelondro/util/SetTools.java @@ -49,12 +49,12 @@ import net.yacy.kelondro.logging.Log; public final class SetTools { - + //public static Comparator fastStringComparator = fastStringComparator(true); // ------------------------------------------------------------------------------------------------ // helper methods - + public static int log2a(int x) { // this computes 1 + log2 // it is the number of bits in x, not the logarithm by 2 @@ -72,10 +72,10 @@ public final class SetTools { // - join by pairwise 
enumeration // - join by iterative tests (where we distinguish left-right and right-left tests) - + public static SortedMap joinConstructive(final Collection> maps, final boolean concatStrings) { // this joins all TreeMap(s) contained in maps - + // first order entities by their size final SortedMap> orderMap = new TreeMap>(); SortedMap singleMap; @@ -84,18 +84,18 @@ public final class SetTools { while (i.hasNext()) { // get next entity: singleMap = i.next(); - + // check result if ((singleMap == null) || (singleMap.isEmpty())) return new TreeMap(); - + // store result in order of result size orderMap.put(Long.valueOf(singleMap.size() * 1000 + count), singleMap); count++; } - + // check if there is any result if (orderMap.isEmpty()) return new TreeMap(); - + // we now must pairwise build up a conjunction of these maps Long k = orderMap.firstKey(); // the smallest, which means, the one with the least entries SortedMap mapA, mapB, joinResult = orderMap.remove(k); @@ -114,7 +114,7 @@ public final class SetTools { if (joinResult.isEmpty()) return new TreeMap(); return joinResult; } - + public static SortedMap joinConstructive(final SortedMap map1, final SortedMap map2, final boolean concatStrings) { // comparators must be equal if ((map1 == null) || (map2 == null)) return null; @@ -134,7 +134,7 @@ public final class SetTools { } return joinConstructiveByEnumeration(map1, map2, concatStrings); } - + @SuppressWarnings("unchecked") private static SortedMap joinConstructiveByTest(final SortedMap small, final SortedMap large, final boolean concatStrings) { final SortedMap result = new TreeMap(large.comparator()); @@ -198,7 +198,7 @@ public final class SetTools { } return result; } - + // now the same for set-set public static SortedSet joinConstructive(final SortedSet set1, final SortedSet set2) { // comparators must be equal @@ -220,9 +220,9 @@ public final class SetTools { return joinConstructiveByEnumeration(set1, set2); } - private static SortedSet joinConstructiveByTest(final SortedSet small, final SortedSet large) { + public static SortedSet joinConstructiveByTest(final Collection small, final SortedSet large) { final Iterator mi = small.iterator(); - final SortedSet result = new TreeSet(small.comparator()); + final SortedSet result = new TreeSet(large.comparator()); A o; while (mi.hasNext()) { o = mi.next(); @@ -256,7 +256,7 @@ public final class SetTools { } return result; } - + /** * test if one set is totally included in another set * @param @@ -269,8 +269,8 @@ public final class SetTools { if (!large.contains(o)) return false; } return true; - } - + } + /** * test if one set is totally included in another set * @param small @@ -282,8 +282,8 @@ public final class SetTools { if (!large.has(handle)) return false; } return true; - } - + } + /** * test if the intersection of two sets is not empty * @param @@ -379,7 +379,7 @@ public final class SetTools { } return false; } - + private static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) { // implement pairwise enumeration final Comparator comp = set1.comparator(); @@ -402,7 +402,7 @@ public final class SetTools { } return false; } - + // ------------------------------------------------------------------------------------------------ // exclude @@ -416,7 +416,7 @@ public final class SetTools { return excludeConstructiveByTestMapInSet(map, set); // return excludeConstructiveByEnumeration(map, set); } - + private static TreeMap excludeConstructiveByTestMapInSet(final TreeMap map, final Set set) { final TreeMap result = 
new TreeMap(map.comparator()); A o; @@ -427,7 +427,7 @@ public final class SetTools { return result; } */ - + public static void excludeDestructive(final Map map, final Set set) { // comparators must be equal if (map == null) return; @@ -440,40 +440,40 @@ public final class SetTools { else excludeDestructiveByTestSetInMap(map, set); } - + private static void excludeDestructiveByTestMapInSet(final Map map, final Set set) { final Iterator mi = map.keySet().iterator(); while (mi.hasNext()) if (set.contains(mi.next())) mi.remove(); } - + private static void excludeDestructiveByTestSetInMap(final Map map, final Set set) { final Iterator si = set.iterator(); while (si.hasNext()) map.remove(si.next()); } - + // and the same again with set-set public static void excludeDestructive(final Set set1, final Set set2) { if (set1 == null) return; if (set2 == null) return; assert !(set1 instanceof SortedSet && set2 instanceof SortedSet) || ((SortedSet) set1).comparator() == ((SortedSet) set2).comparator(); if (set1.isEmpty() || set2.isEmpty()) return; - + if (set1.size() < set2.size()) excludeDestructiveByTestSmallInLarge(set1, set2); else excludeDestructiveByTestLargeInSmall(set1, set2); } - - private static void excludeDestructiveByTestSmallInLarge(final Set small, final Set large) { + + public static void excludeDestructiveByTestSmallInLarge(final Collection small, final Set large) { final Iterator mi = small.iterator(); while (mi.hasNext()) if (large.contains(mi.next())) mi.remove(); } - - private static void excludeDestructiveByTestLargeInSmall(final Set large, final Set small) { + + public static void excludeDestructiveByTestLargeInSmall(final Set large, final Collection small) { final Iterator si = small.iterator(); while (si.hasNext()) large.remove(si.next()); } - + // ------------------------------------------------------------------------------------------------ public static SortedMap loadMap(final String filename, final String sep) { @@ -488,13 +488,13 @@ public final class SetTools { if ((line.length() > 0 && line.charAt(0) != '#') && ((pos = line.indexOf(sep)) > 0)) map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim()); } - } catch (final IOException e) { + } catch (final IOException e) { } finally { if (br != null) try { br.close(); } catch (final Exception e) {} } return map; } - + public static SortedMap> loadMapMultiValsPerKey(final String filename, final String sep) { final SortedMap> map = new TreeMap>(); BufferedReader br = null; @@ -511,17 +511,17 @@ public final class SetTools { map.get(key).add(value); } } - } catch (final IOException e) { + } catch (final IOException e) { } finally { if (br != null) try { br.close(); } catch (final Exception e) {} } return map; } - + public static SortedSet loadList(final File file, final Comparator c) { final SortedSet list = new TreeSet(c); if (!(file.exists())) return list; - + BufferedReader br = null; try { br = new BufferedReader(new InputStreamReader(new FileInputStream(file))); @@ -531,7 +531,7 @@ public final class SetTools { if (line.length() > 0 && line.charAt(0) != '#') list.add(line.trim().toLowerCase()); } br.close(); - } catch (final IOException e) { + } catch (final IOException e) { } finally { if (br != null) try{br.close();}catch(final Exception e){} } @@ -547,7 +547,7 @@ public final class SetTools { } return sb.toString(); } - + public static String setToString(final Set set, final char separator) { final Iterator i = set.iterator(); final StringBuilder sb = new StringBuilder(set.size() 
* 7); @@ -560,7 +560,7 @@ public final class SetTools { // ------------------------------------------------------------------------------------------------ - + public static void main(final String[] args) { final SortedMap m = new TreeMap(); final SortedMap s = new TreeMap(); diff --git a/source/net/yacy/peers/SeedDB.java b/source/net/yacy/peers/SeedDB.java index f5a11642d..9467beab5 100644 --- a/source/net/yacy/peers/SeedDB.java +++ b/source/net/yacy/peers/SeedDB.java @@ -29,11 +29,10 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; -import java.lang.ref.SoftReference; import java.net.InetAddress; import java.util.ArrayList; +import java.util.Collection; import java.util.HashSet; -import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; @@ -98,8 +97,6 @@ public final class SeedDB implements AlternativeDomainNames { private Seed mySeed; // my own seed private final Set myBotIDs; // list of id's that this bot accepts as robots.txt identification - private final Map nameLookupCache; // a name-to-hash relation - private final Map> ipLookupCache; public SeedDB( final File networkRoot, @@ -128,12 +125,6 @@ public final class SeedDB implements AlternativeDomainNames { this.seedPassiveDB = openSeedTable(this.seedPassiveDBFile); this.seedPotentialDB = openSeedTable(this.seedPotentialDBFile); - // start our virtual DNS service for yacy peers with empty cache - this.nameLookupCache = new HashMap(); - - // cache for reverse name lookup - this.ipLookupCache = new HashMap>(); - // check if we are in the seedCaches: this can happen if someone else published our seed removeMySeed(); @@ -184,12 +175,6 @@ public final class SeedDB implements AlternativeDomainNames { this.seedPassiveDB = openSeedTable(this.seedPassiveDBFile); this.seedPotentialDB = openSeedTable(this.seedPotentialDBFile); - // start our virtual DNS service for yacy peers with empty cache - this.nameLookupCache.clear(); - - // cache for reverse name lookup - this.ipLookupCache.clear(); - // check if we are in the seedCaches: this can happen if someone else published our seed removeMySeed(); @@ -497,7 +482,6 @@ public final class SeedDB implements AlternativeDomainNames { //seed.put(yacySeed.LASTSEEN, yacyCore.shortFormatter.format(new Date(yacyCore.universalTime()))); synchronized (this) { try { - this.nameLookupCache.put(seed.getName(), seed.hash); final ConcurrentMap seedPropMap = seed.getMap(); this.seedActiveDB.insert(ASCII.getBytes(seed.hash), seedPropMap); this.seedPassiveDB.delete(ASCII.getBytes(seed.hash)); @@ -513,7 +497,6 @@ public final class SeedDB implements AlternativeDomainNames { if (seed.isProper(false) != null) return; synchronized (this) { try { - this.nameLookupCache.remove(seed.getName()); this.seedActiveDB.delete(ASCII.getBytes(seed.hash)); this.seedPotentialDB.delete(ASCII.getBytes(seed.hash)); } catch (final Exception e) { Log.logWarning("yacySeedDB", "could not remove hash ("+ e.getClass() +"): "+ e.getMessage()); } @@ -532,7 +515,6 @@ public final class SeedDB implements AlternativeDomainNames { if (seed.isProper(false) != null) return; synchronized (this) { try { - this.nameLookupCache.remove(seed.getName()); this.seedActiveDB.delete(ASCII.getBytes(seed.hash)); this.seedPassiveDB.delete(ASCII.getBytes(seed.hash)); } catch (final Exception e) { Log.logWarning("yacySeedDB", "could not remove hash ("+ e.getClass() +"): "+ e.getMessage()); } @@ -637,52 +619,35 @@ public final class SeedDB implements AlternativeDomainNames 
{ return this.mySeed; } - // then try to use the cache peerName = peerName.toLowerCase(); - final String seedhash = this.nameLookupCache.get(peerName); Seed seed; - if (seedhash != null) { - seed = this.get(seedhash); - if (seed != null) { - //System.out.println("*** found lookupByName in cache: " + peerName); - return seed; - } - } // enumerate the cache String name = Seed.checkPeerName(peerName); - Map.Entry> entry; - try { - Iterator>> mmap = this.seedActiveDB.entries(Seed.NAME, name); - while (mmap.hasNext()) { - entry = mmap.next(); - if (entry == null) break; - seed = this.getConnected(ASCII.String(entry.getKey())); + synchronized (this) { try { + Collection idx = this.seedActiveDB.select(Seed.NAME, name); + for (byte[] pk: idx) { + seed = this.getConnected(ASCII.String(pk)); if (seed == null) continue; - if (seed.isProper(false) == null) this.nameLookupCache.put(seed.getName().toLowerCase(), seed.hash); //System.out.println("*** found lookupByName in seedActiveDB: " + peerName); return seed; } } catch ( IOException e ) { - } - try { - Iterator>> mmap = this.seedPassiveDB.entries(Seed.NAME, name); - while (mmap.hasNext()) { - entry = mmap.next(); - if (entry == null) break; - seed = this.getConnected(ASCII.String(entry.getKey())); + }} + synchronized (this) { try { + Collection idx = this.seedPassiveDB.select(Seed.NAME, name); + for (byte[] pk: idx) { + seed = this.getDisconnected(ASCII.String(pk)); if (seed == null) continue; - if (seed.isProper(false) == null) this.nameLookupCache.put(seed.getName().toLowerCase(), seed.hash); //System.out.println("*** found lookupByName in seedPassiveDB: " + peerName); return seed; } } catch ( IOException e ) { - } - + }} + // check local seed if (this.mySeed == null) initMySeed(); name = this.mySeed.getName().toLowerCase(); - if (this.mySeed.isProper(false) == null) this.nameLookupCache.put(name, this.mySeed.hash); if (name.equals(peerName)) return this.mySeed; // nothing found return null; @@ -705,31 +670,16 @@ public final class SeedDB implements AlternativeDomainNames { } // then try to use the cache - final SoftReference ref = this.ipLookupCache.get(peerIP); Seed seed = null; - if (ref != null) { - seed = ref.get(); - if (seed != null) { - //System.out.println("*** found lookupByIP in cache: " + peerIP.toString() + " -> " + this.mySeed.getName()); - return seed; - } - } - String ipString = peerIP.getHostAddress(); - Map.Entry> entry; - if (lookupConnected) { + if (lookupConnected) synchronized (this) { try { - Iterator>> mmap = this.seedActiveDB.entries(Seed.IP, ipString); - while (mmap.hasNext()) { - entry = mmap.next(); - if (entry == null) break; - String p = entry.getValue().get(Seed.PORT); - if (p == null) continue; - if (port > 0 && Integer.parseInt(p) != port) continue; - seed = this.getConnected(ASCII.String(entry.getKey())); + Collection idx = this.seedActiveDB.select(Seed.IP, ipString); + for (byte[] pk: idx) { + seed = this.getConnected(ASCII.String(pk)); if (seed == null) continue; - this.ipLookupCache.put(peerIP, new SoftReference(seed)); + if (seed.getPort() != port) continue; //System.out.println("*** found lookupByIP in connected: " + peerIP.toString() + " -> " + seed.getName()); return seed; } @@ -737,18 +687,13 @@ public final class SeedDB implements AlternativeDomainNames { } } - if (lookupDisconnected) { + if (lookupDisconnected) synchronized (this) { try { - Iterator>> mmap = this.seedPassiveDB.entries(Seed.IP, ipString); - while (mmap.hasNext()) { - entry = mmap.next(); - if (entry == null) break; - String p = 
entry.getValue().get(Seed.PORT); - if (p == null) continue; - if (port > 0 && Integer.parseInt(p) != port) continue; - seed = this.getDisconnected(ASCII.String(entry.getKey())); + Collection idx = this.seedPassiveDB.select(Seed.IP, ipString); + for (byte[] pk: idx) { + seed = this.getDisconnected(ASCII.String(pk)); if (seed == null) continue; - this.ipLookupCache.put(peerIP, new SoftReference(seed)); + if (seed.getPort() != port) continue; //System.out.println("*** found lookupByIP in disconnected: " + peerIP.toString() + " -> " + seed.getName()); return seed; } @@ -756,18 +701,13 @@ public final class SeedDB implements AlternativeDomainNames { } } - if (lookupPotential) { + if (lookupPotential) synchronized (this) { try { - Iterator>> mmap = this.seedPotentialDB.entries(Seed.IP, ipString); - while (mmap.hasNext()) { - entry = mmap.next(); - if (entry == null) break; - String p = entry.getValue().get(Seed.PORT); - if (p == null) continue; - if (port > 0 && Integer.parseInt(p) != port) continue; - seed = this.getPotential(ASCII.String(entry.getKey())); + Collection idx = this.seedPotentialDB.select(Seed.IP, ipString); + for (byte[] pk: idx) { + seed = this.getPotential(ASCII.String(pk)); if (seed == null) continue; - this.ipLookupCache.put(peerIP, new SoftReference(seed)); + if (seed.getPort() != port) continue; //System.out.println("*** found lookupByIP in potential: " + peerIP.toString() + " -> " + seed.getName()); return seed; } diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index 8c0e29ef4..80d132259 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -365,6 +365,7 @@ public final class yacyRelease extends yacyVersion { try { final CharBuffer signBuffer = new CharBuffer(getSignatureFile()); final byte[] signByteBuffer = Base64Order.standardCoder.decode(signBuffer.toString().trim()); + signBuffer.close(); final CryptoLib cl = new CryptoLib(); for(final yacyUpdateLocation updateLocation : latestReleaseLocations) { try { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 4a34e54c8..87bf68729 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -295,7 +295,7 @@ public final class Switchboard extends serverSwitch } // init TrayIcon if possible - tray = new Tray(this); + this.tray = new Tray(this); // remote proxy configuration initRemoteProxy(); @@ -636,6 +636,7 @@ public final class Switchboard extends serverSwitch // define a realtime parsable mimetype list this.log.logConfig("Parser: Initializing Mime Type deny list"); TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); + TextParser.setDenyExtension(getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, "")); // prepare a solr index profile switch list final File solrBackupProfile = new File("defaults/solr.keys.list"); @@ -650,7 +651,7 @@ public final class Switchboard extends serverSwitch // update the working scheme with the backup scheme. This is necessary to include new features. 
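A note on the SeedDB hunks above: the hand-rolled nameLookupCache and ipLookupCache are gone because MapDataMining.select now resolves candidate primary keys through the new column index. The recurring lookup shape, condensed (names follow the patch; the surrounding SeedDB instance and exception handling are assumed):

// sketch: ask the column index for candidate keys, then re-check
// whatever the index does not cover (here: the port)
Collection<byte[]> idx = this.seedActiveDB.select(Seed.IP, ipString);
for (final byte[] pk : idx) {
    final Seed seed = this.getConnected(ASCII.String(pk));
    if (seed == null) continue;           // stale index entry
    if (seed.getPort() != port) continue; // the index only covers the IP column
    return seed;
}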
// new features are always activated by default - workingScheme.fill(backupScheme); + workingScheme.fill(backupScheme, false); // set up the solr interface final String solrurls = @@ -1598,7 +1599,7 @@ public final class Switchboard extends serverSwitch Domains.close(); AccessTracker.dumpLog(new File("DATA/LOG/queries.log")); UPnP.deletePortMapping(); - tray.remove(); + this.tray.remove(); try { HTTPClient.closeConnectionManager(); } catch ( final InterruptedException e ) { @@ -3327,7 +3328,7 @@ public final class Switchboard extends serverSwitch this.peers.mySeed().put(Seed.NCOUNT, Integer.toString(this.crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's) this.peers.mySeed().put( Seed.RCOUNT, - Integer.toString(this.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's) + Integer.toString(this.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.GLOBAL))); // the number of links that the peer provides for remote crawling (ZURL's) this.peers.mySeed().put(Seed.ICOUNT, Long.toString(this.indexSegments.RWICount())); // the minimum number of words that the peer has indexed (as it says) this.peers.mySeed().put(Seed.SCOUNT, Integer.toString(this.peers.sizeConnected())); // the number of seeds that the peer has stored this.peers.mySeed().put( diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index ebc18dfd1..5f953dde4 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -599,19 +599,21 @@ public final class MetadataRepository implements Iterable { public Map domainSampleCollector() throws IOException { final Map map = new HashMap(); // first collect all domains and calculate statistics about it - final CloneableIterator i = this.urlIndexFile.keys(true, null); - String hosthash; - byte[] urlhashb; - URLHashCounter ds; - if (i != null) while (i.hasNext()) { - urlhashb = i.next(); - hosthash = ASCII.String(urlhashb, 6, 6); - ds = map.get(hosthash); - if (ds == null) { - ds = new URLHashCounter(urlhashb); - map.put(hosthash, ds); - } else { - ds.count++; + synchronized (this) { + final CloneableIterator i = this.urlIndexFile.keys(true, null); + String hosthash; + byte[] urlhashb; + URLHashCounter ds; + if (i != null) while (i.hasNext()) { + urlhashb = i.next(); + hosthash = ASCII.String(urlhashb, 6, 6); + ds = map.get(hosthash); + if (ds == null) { + ds = new URLHashCounter(urlhashb); + map.put(hosthash, ds); + } else { + ds.count++; + } } } return map; @@ -739,11 +741,13 @@ public final class MetadataRepository implements Iterable { // first collect all url hashes that belong to the domain assert hosthash.length() == 6; final ArrayList l = new ArrayList(); - final CloneableIterator i = this.urlIndexFile.keys(true, null); - String hash; - while (i != null && i.hasNext()) { - hash = ASCII.String(i.next()); - if (hosthash.equals(hash.substring(6))) l.add(hash); + synchronized (this) { + final CloneableIterator i = this.urlIndexFile.keys(true, null); + String hash; + while (i != null && i.hasNext()) { + hash = ASCII.String(i.next()); + if (hosthash.equals(hash.substring(6))) l.add(hash); + } } // then delete the urls using this list diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 20627524e..f45bde9f7 100644 --- a/source/net/yacy/search/index/Segment.java +++ 
b/source/net/yacy/search/index/Segment.java @@ -73,7 +73,7 @@ public class Segment { public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash public static final int lowcachedivisor = 900; - public static final long targetFileSize = 256 * 1024 * 1024; // 256 MB + public static final long targetFileSize = 64 * 1024 * 1024; // 64 MB public static final int writeBufferSize = 4 * 1024 * 1024; // the reference factory
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 2e759cd70..052f70062 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -35,7 +35,6 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.SortedSet; -import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -56,7 +55,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Bitfield; -import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.SetTools; import net.yacy.peers.Seed; import net.yacy.search.index.Segment; @@ -162,7 +160,7 @@ public final class QueryParams { } } else { this.queryString = queryString; - final TreeSet<String>[] cq = cleanQuery(queryString); + final Collection<String>[] cq = cleanQuery(queryString); this.queryHashes = Word.words2hashesHandles(cq[0]); this.excludeHashes = Word.words2hashesHandles(cq[1]); this.fullqueryHashes = Word.words2hashesHandles(cq[2]); @@ -378,11 +376,11 @@ public final class QueryParams { private static String seps = "'.,/&_"; static {seps += '"';} @SuppressWarnings("unchecked") - public static TreeSet<String>[] cleanQuery(String querystring) { + public static Collection<String>[] cleanQuery(String querystring) { // returns three sets: a query set, an exclude set and a full query set - final TreeSet<String> query = new TreeSet<String>(NaturalOrder.naturalComparator); - final TreeSet<String> exclude = new TreeSet<String>(NaturalOrder.naturalComparator); - final TreeSet<String> fullquery = new TreeSet<String>(NaturalOrder.naturalComparator); + final Collection<String> query = new ArrayList<String>(); + final Collection<String> exclude = new ArrayList<String>(); + final Collection<String> fullquery = new ArrayList<String>(); if ((querystring != null) && (!querystring.isEmpty())) { @@ -401,22 +399,23 @@ public final class QueryParams { final String[] queries = querystring.split(" "); for (String quer : queries) { if (quer.startsWith("-")) { - exclude.add(quer.substring(1)); + String x = quer.substring(1); + if (!exclude.contains(x)) exclude.add(x); } else { while ((c = quer.indexOf('-')) >= 0) { s = quer.substring(0, c); l = s.length(); - if (l >= Condenser.wordminsize) {query.add(s);} - if (l > 0) {fullquery.add(s);} + if (l >= Condenser.wordminsize && !query.contains(s)) {query.add(s);} + if (l > 0 && !fullquery.contains(s)) {fullquery.add(s);} quer = quer.substring(c + 1); } l = quer.length(); - if (l >= Condenser.wordminsize) {query.add(quer);} - if (l > 0) {fullquery.add(quer);} + if (l >= Condenser.wordminsize && !query.contains(quer)) {query.add(quer);} + if (l > 0 && !fullquery.contains(quer)) {fullquery.add(quer);} } } } - return new TreeSet[]{query, exclude, fullquery}; + return new Collection[]{query, exclude, fullquery}; } public String queryString(final boolean encodeHTML) { @@ -438,7 +437,7 @@ public final class QueryParams { } } - public TreeSet<String>[] queryWords() { + public Collection<String>[] queryWords() { return cleanQuery(this.queryString); }
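A note on the cleanQuery rewrite: moving from TreeSet to ArrayList keeps query words in the order the user typed them (a TreeSet would sort them alphabetically), and the contains() checks preserve the old set semantics. A tiny self-contained demonstration of the idiom:

import java.util.ArrayList;
import java.util.Collection;

// dedup-on-insert keeps first-seen order; the O(n) scan is fine for short queries
final Collection<String> query = new ArrayList<String>();
for (final String w : "yacy peer search peer".split(" ")) {
    if (!query.contains(w)) query.add(w);
}
// query is [yacy, peer, search]: insertion order kept, duplicate dropped;
// the old TreeSet would have produced the sorted order [peer, search, yacy]

A note on CharBuffer.grow: replacing the fixed +1024 step with a 20% geometric factor turns long append runs from O(n^2) copying into amortized O(n), and the new flush()/trimToSize() releases the slack once a buffer is kept around. The growth arithmetic, as a standalone sketch:

// newsize = 12 * max(current, needed) / 10, i.e. at least 20% headroom per resize
int cap = 100;
for (int i = 0; i < 5; i++) {
    cap = 12 * cap / 10;
    System.out.println(cap); // prints 120, 144, 172, 206, 247
}

A note on the SplitTable changes: every direct table operation now runs under synchronized (this), and put() re-resolves its keeper inside the lock before creating a new table. The general shape of that re-check idiom, as a self-contained sketch (a generic registry, not the SplitTable code itself):

import java.util.HashMap;
import java.util.Map;

class KeeperRegistry {
    private final Map<String, StringBuilder> keepers = new HashMap<String, StringBuilder>();

    synchronized StringBuilder keeperOf(final String key) {
        return this.keepers.get(key);
    }

    StringBuilder getOrCreate(final String key) {
        StringBuilder keeper = keeperOf(key); // optimistic lookup outside the creation path
        if (keeper != null) return keeper;
        synchronized (this) {
            keeper = this.keepers.get(key);   // re-check: another thread may have created it
            if (keeper == null) {
                keeper = new StringBuilder();
                this.keepers.put(key, keeper);
            }
            return keeper;
        }
    }
}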