diff --git a/build.properties b/build.properties
index 6c66862df..58349756e 100644
--- a/build.properties
+++ b/build.properties
@@ -3,7 +3,7 @@ javacSource=1.4
 javacTarget=1.4
 # Release Configuration
-releaseVersion=0.545
+releaseVersion=0.546
 releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseFileParentDir=yacy
diff --git a/htroot/CacheAdmin_p.html b/htroot/CacheAdmin_p.html
index c04c5f092..0f6e916ab 100644
--- a/htroot/CacheAdmin_p.html
+++ b/htroot/CacheAdmin_p.html
@@ -5,7 +5,8 @@
 #%env/templates/metas.template%#
- #%env/templates/header.template%#
+ #%env/templates/header.template%#
+ #%env/templates/submenuWebStructure.template%#

Web Cache

The current cache size is #[cachesize]# KB. The maximum cache size is #[cachemax]# KB.

Crawl Thread + Status Start URL Depth Filter @@ -46,6 +47,7 @@ #{crawlProfiles}# #[name]# + #(status)#terminated::active#(/status)# #[startURL]# #[depth]# #[filter]# @@ -58,12 +60,18 @@ #(indexText)#no::yes#(/indexText)# #(indexMedia)#no::yes#(/indexMedia)# #(remoteIndexing)#no::yes#(/remoteIndexing)# - #(deleteButton)#:: -

-
-
- #(/deleteButton)# -
+ #(terminateButton)#:: +
+ + +
+ #(/terminateButton)# + #(deleteButton)#:: +
+ + +
+ #(/deleteButton)# #{/crawlProfiles}# diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index ab1eb894b..2e8f26464 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -1,11 +1,15 @@ // CrawlProfileEditor_p.java -// ------------------------------- -// part of the AnomicHTTPD caching proxy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004, 2005 -// last major change: 04.07.2005 +// (C) 2005, by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 04.07.2005 on http://yacy.net // +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -19,29 +23,6 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -// You must compile this file with -// javac -classpath .:../classes CrawlProfileEditor_p.java -// if the shell's current path is HTROOT import java.io.IOException; import java.util.ArrayList; @@ -103,14 +84,23 @@ public class CrawlProfileEditor_p { // read post for handle String handle = (post == null) ? 
"" : post.get("handle", ""); - if ((post != null) && (post.containsKey("deleteprofile"))) { - // deletion of a crawl - sb.profiles.removeEntry(handle); + if (post != null) { + if (post.containsKey("terminate")) { + // termination of a crawl: shift the crawl from active to passive + sb.profilesPassiveCrawls.newEntry(sb.profilesActiveCrawls.getEntry(handle).map()); + sb.profilesActiveCrawls.removeEntry(handle); + // delete all entries from the crawl queue that are deleted here + sb.noticeURL.removeByProfileHandle(handle); + } + if (post.containsKey("delete")) { + // deletion of a terminated crawl profile + sb.profilesPassiveCrawls.removeEntry(handle); + } } // generate handle list int count = 0; - Iterator it = sb.profiles.profiles(true); + Iterator it = sb.profilesActiveCrawls.profiles(true); entry selentry; while (it.hasNext()) { selentry = (entry)it.next(); @@ -126,7 +116,7 @@ public class CrawlProfileEditor_p { count++; } prop.put("profiles", count); - selentry = sb.profiles.getEntry(handle); + selentry = sb.profilesActiveCrawls.getEntry(handle); // read post for change submit if ((post != null) && (selentry != null)) { @@ -138,7 +128,7 @@ public class CrawlProfileEditor_p { tee = (eentry) it.next(); String cval = (String) selentry.map().get(tee.name); String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval); - if (!cval.equals(val)) sb.profiles.changeEntry(selentry, tee.name, val); + if (!cval.equals(val)) sb.profilesActiveCrawls.changeEntry(selentry, tee.name, val); } } catch (IOException ex) { prop.put("error", 1); @@ -149,47 +139,22 @@ public class CrawlProfileEditor_p { // generate crawl profile table count = 0; + boolean dark = true; int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160); - it = sb.profiles.profiles(true); plasmaCrawlProfile.entry profile; - boolean dark = true; + // put active crawls into list + it = sb.profilesActiveCrawls.profiles(true); while (it.hasNext()) { profile = (plasmaCrawlProfile.entry) it.next(); - prop.put("crawlProfiles_"+count+"_dark", ((dark) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_name", profile.name()); - prop.put("crawlProfiles_"+count+"_startURL", profile.startURL()); - prop.put("crawlProfiles_"+count+"_handle", profile.handle()); - prop.put("crawlProfiles_"+count+"_depth", profile.generalDepth()); - prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter()); - prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder()); - prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth())); - - //start contrib [MN] - int i = 0; - String item; - while((i <= domlistlength) && !((item = profile.domName(true, i)).equals(""))){ - if(i == domlistlength){ - item = item + " ..."; - } - prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent_"+i+"_item", item); - i++; - } - - prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", i); - //end contrib [MN] - - prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages()); - prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_indexText", ((profile.indexText()) ? 
1 : 0)); - prop.put("crawlProfiles_"+count+"_indexMedia", ((profile.indexMedia()) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_remoteIndexing", ((profile.remoteIndexing()) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_deleteButton", (((profile.name().equals("remote")) || - (profile.name().equals("proxy")) || - (profile.name().equals("snippetText")) || - (profile.name().equals("snippetMedia")) ? 0 : 1))); - prop.put("crawlProfiles_"+count+"_deleteButton_handle", profile.handle()); - + putProfileEntry(prop, profile, true, dark, count, domlistlength); + dark = !dark; + count++; + } + // put passive crawls into list + it = sb.profilesPassiveCrawls.profiles(true); + while (it.hasNext()) { + profile = (plasmaCrawlProfile.entry) it.next(); + putProfileEntry(prop, profile, false, dark, count, domlistlength); dark = !dark; count++; } @@ -223,4 +188,44 @@ public class CrawlProfileEditor_p { return prop; } + + private static void putProfileEntry(servletProperties prop, plasmaCrawlProfile.entry profile, boolean active, boolean dark, int count, int domlistlength) { + prop.put("crawlProfiles_" + count + "_dark", ((dark) ? 1 : 0)); + prop.put("crawlProfiles_" + count + "_status", ((active) ? 1 : 0)); + prop.put("crawlProfiles_" + count + "_name", profile.name()); + prop.put("crawlProfiles_" + count + "_startURL", profile.startURL()); + prop.put("crawlProfiles_" + count + "_handle", profile.handle()); + prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth()); + prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter()); + prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder()); + prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth())); + + // start contrib [MN] + int i = 0; + String item; + while ((i <= domlistlength) && !((item = profile.domName(true, i)).equals(""))){ + if(i == domlistlength){ + item = item + " ..."; + } + prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent_"+i+"_item", item); + i++; + } + + prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", i); + // end contrib [MN] + + prop.put("crawlProfiles_" + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages()); + prop.put("crawlProfiles_" + count + "_withQuery", (profile.crawlingQ()) ? 1 : 0); + prop.put("crawlProfiles_" + count + "_storeCache", (profile.storeHTCache()) ? 1 : 0); + prop.put("crawlProfiles_" + count + "_indexText", (profile.indexText()) ? 1 : 0); + prop.put("crawlProfiles_" + count + "_indexMedia", (profile.indexMedia()) ? 1 : 0); + prop.put("crawlProfiles_" + count + "_remoteIndexing", (profile.remoteIndexing()) ? 1 : 0); + prop.put("crawlProfiles_" + count + "_terminateButton", ((!active) || (profile.name().equals("remote")) || + (profile.name().equals("proxy")) || + (profile.name().equals("snippetText")) || + (profile.name().equals("snippetMedia"))) ? 0 : 1); + prop.put("crawlProfiles_" + count + "_terminateButton_handle", profile.handle()); + prop.put("crawlProfiles_" + count + "_deleteButton", (active) ? 
0 : 1); + prop.put("crawlProfiles_" + count + "_deleteButton_handle", profile.handle()); + } } diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 7b992180e..a8e0a3e24 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -69,6 +69,11 @@ public class CrawlResults { tabletype = 0; } + if ((post != null) && (post.containsKey("autoforward")) && (tabletype == 5) && (sb.wordIndex.loadedURL.getStackSize(5) == 0)) { + // the main menu does a request to the local crawler page, but in case this table is empty, the overview page is shown + tabletype = 0; + } + // check if authorization is needed and/or given if (((tabletype > 0) && (tabletype < 6)) || (post.containsKey("clearlist")) || diff --git a/htroot/CrawlURLFetch_p.java b/htroot/CrawlURLFetch_p.java index f16e714ed..3abacae11 100644 --- a/htroot/CrawlURLFetch_p.java +++ b/htroot/CrawlURLFetch_p.java @@ -83,9 +83,9 @@ public class CrawlURLFetch_p { public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) { if (profile == null) { - profile = ((plasmaSwitchboard)env).profiles.newEntry( + profile = ((plasmaSwitchboard)env).profilesActiveCrawls.newEntry( "URLFetcher", // Name - "", // URL + null, // URL ".*", ".*", // General / specific filter 0, 0, // General / specific depth -1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages diff --git a/htroot/IndexCreateIndexingQueue_p.html b/htroot/IndexCreateIndexingQueue_p.html index 2e467b9f1..c127527b0 100644 --- a/htroot/IndexCreateIndexingQueue_p.html +++ b/htroot/IndexCreateIndexingQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuCrawler.template%# + #%env/templates/submenuIndexCreate.template%#

Indexing Queue

diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java
index a99f81890..970c69376 100644
--- a/htroot/IndexCreateIndexingQueue_p.java
+++ b/htroot/IndexCreateIndexingQueue_p.java
@@ -92,6 +92,7 @@ public class IndexCreateIndexingQueue_p {
                         plasmaHTCache.deleteFile(entry.url());
                     }
                 }
+                switchboard.sbQueue.clear(); // reset file to clean up content completely
             }
         } catch (Exception e) {}
     } else if (post.containsKey("deleteEntry")) {
diff --git a/htroot/IndexCreateLoaderQueue_p.html b/htroot/IndexCreateLoaderQueue_p.html
index cd15f5c06..ba8727f4a 100644
--- a/htroot/IndexCreateLoaderQueue_p.html
+++ b/htroot/IndexCreateLoaderQueue_p.html
@@ -6,7 +6,7 @@
 #%env/templates/header.template%#
- #%env/templates/submenuCrawler.template%#
+ #%env/templates/submenuIndexCreate.template%#

Loader Queue

diff --git a/htroot/IndexCreateWWWGlobalQueue_p.html b/htroot/IndexCreateWWWGlobalQueue_p.html
index d7234864b..b2c8d8dec 100644
--- a/htroot/IndexCreateWWWGlobalQueue_p.html
+++ b/htroot/IndexCreateWWWGlobalQueue_p.html
@@ -5,8 +5,8 @@
 #%env/templates/metas.template%#
- #%env/templates/header.template%#
- #%env/templates/submenuCrawler.template%#
+ #%env/templates/header.template%#
+ #%env/templates/submenuIndexCreate.template%#

Global Crawl Queue

This queue stores the urls that shall be sent to other peers to perform a remote crawl. diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index 1f067130b..d91a72ee4 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -94,7 +94,7 @@ public class IndexCreateWWWGlobalQueue_p { prop.put("info_numEntries", c); } else if (post.containsKey("deleteEntry")) { String urlHash = (String) post.get("deleteEntry"); - switchboard.noticeURL.remove(urlHash); + switchboard.noticeURL.removeByURLHash(urlHash); prop.put("LOCATION",""); return prop; } @@ -118,7 +118,7 @@ public class IndexCreateWWWGlobalQueue_p { if ((urle != null)&&(urle.url()!=null)) { initiator = yacyCore.seedDB.getConnected(urle.initiator()); profileHandle = urle.profileHandle(); - profileEntry = (profileHandle == null) ? null : switchboard.profiles.getEntry(profileHandle); + profileEntry = (profileHandle == null) ? null : switchboard.profilesActiveCrawls.getEntry(profileHandle); prop.put("crawler-queue_list_"+showNum+"_dark", ((dark) ? 1 : 0) ); prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : htmlTools.encodeUnicode2html(initiator.getName(), true)) ); prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : htmlTools.encodeUnicode2html(profileEntry.name(), true))); diff --git a/htroot/IndexCreateWWWLocalQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html index ebefb3bb3..fbfd4d714 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.html +++ b/htroot/IndexCreateWWWLocalQueue_p.html @@ -4,9 +4,9 @@ YaCy '#[clientname]#': Local Crawl Queue #%env/templates/metas.template%# - - #%env/templates/header.template%# - #%env/templates/submenuCrawler.template%# + + #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%#

Local Crawl Queue

This queue stores the urls that shall be crawled localy by this peer. diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 049ee5578..d4ecb88c7 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -109,7 +109,7 @@ public class IndexCreateWWWLocalQueue_p { if (option == PROFILE) { // search and delete the crawl profile (_much_ faster, independant of queue size) // XXX: what to do about the annoying LOST PROFILE messages in the log? - Iterator it = switchboard.profiles.profiles(true); + Iterator it = switchboard.profilesActiveCrawls.profiles(true); plasmaCrawlProfile.entry entry; while (it.hasNext()) { entry = (plasmaCrawlProfile.entry)it.next(); @@ -119,8 +119,9 @@ public class IndexCreateWWWLocalQueue_p { name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) || name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)) continue; - if (compiledPattern.matcher(name).find()) - switchboard.profiles.removeEntry(entry.handle()); + if (compiledPattern.matcher(name).find()) { + switchboard.profilesActiveCrawls.removeEntry(entry.handle()); + } } } else { // iterating through the list of URLs @@ -144,7 +145,7 @@ public class IndexCreateWWWLocalQueue_p { if (value != null) { Matcher matcher = compiledPattern.matcher(value); if (matcher.find()) { - switchboard.noticeURL.remove(entry.url().hash()); + switchboard.noticeURL.removeByURLHash(entry.url().hash()); } } } @@ -158,7 +159,7 @@ public class IndexCreateWWWLocalQueue_p { prop.put("info_numEntries", c); } else if (post.containsKey("deleteEntry")) { String urlHash = (String) post.get("deleteEntry"); - switchboard.noticeURL.remove(urlHash); + switchboard.noticeURL.removeByURLHash(urlHash); prop.put("LOCATION",""); return prop; } @@ -182,7 +183,7 @@ public class IndexCreateWWWLocalQueue_p { if ((urle != null)&&(urle.url()!=null)) { initiator = yacyCore.seedDB.getConnected(urle.initiator()); profileHandle = urle.profileHandle(); - profileEntry = (profileHandle == null) ? null : switchboard.profiles.getEntry(profileHandle); + profileEntry = (profileHandle == null) ? null : switchboard.profilesActiveCrawls.getEntry(profileHandle); prop.put("crawler-queue_list_"+showNum+"_dark", ((dark) ? 1 : 0) ); prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : htmlTools.encodeUnicode2html(initiator.getName(), true)) ); prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); diff --git a/htroot/IndexCreateWWWRemoteQueue_p.html b/htroot/IndexCreateWWWRemoteQueue_p.html index cbe2a95c9..cdde8bb54 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.html +++ b/htroot/IndexCreateWWWRemoteQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuCrawler.template%# + #%env/templates/submenuIndexCreate.template%#

Remote Crawl Queue

This queue stores the urls that other peers sent to you in order to perform a remote crawl for them. diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java index 01ba6c758..cb30a2cda 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ b/htroot/IndexCreateWWWRemoteQueue_p.java @@ -93,7 +93,7 @@ public class IndexCreateWWWRemoteQueue_p { prop.put("info_numEntries", c); } else if (post.containsKey("deleteEntry")) { String urlHash = (String) post.get("deleteEntry"); - sb.noticeURL.remove(urlHash); + sb.noticeURL.removeByURLHash(urlHash); prop.put("LOCATION",""); return prop; } @@ -117,7 +117,7 @@ public class IndexCreateWWWRemoteQueue_p { if (urle != null && urle.url() != null) { initiator = yacyCore.seedDB.getConnected(urle.initiator()); profileHandle = urle.profileHandle(); - profileEntry = (profileHandle == null) ? null : sb.profiles.getEntry(profileHandle); + profileEntry = (profileHandle == null) ? null : sb.profilesActiveCrawls.getEntry(profileHandle); prop.put("crawler-queue_list_" + showNum + "_dark", ((dark) ? 1 : 0) ); prop.put("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java index 92589bbc7..3332fb583 100644 --- a/htroot/ProxyIndexingMonitor_p.java +++ b/htroot/ProxyIndexingMonitor_p.java @@ -117,11 +117,11 @@ public class ProxyIndexingMonitor_p { prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db } else { try { - sb.profiles.changeEntry(sb.defaultProxyProfile, "generalDepth", Integer.toString(newProxyPrefetchDepth)); - sb.profiles.changeEntry(sb.defaultProxyProfile, "storeHTCache", (proxyStoreHTCache) ? "true": "false"); - sb.profiles.changeEntry(sb.defaultProxyProfile, "remoteIndexing",proxyIndexingRemote ? "true":"false"); - sb.profiles.changeEntry(sb.defaultProxyProfile, "indexText",proxyIndexingLocalText ? "true":"false"); - sb.profiles.changeEntry(sb.defaultProxyProfile, "indexMedia",proxyIndexingLocalMedia ? "true":"false"); + sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "generalDepth", Integer.toString(newProxyPrefetchDepth)); + sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "storeHTCache", (proxyStoreHTCache) ? "true": "false"); + sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "remoteIndexing",proxyIndexingRemote ? "true":"false"); + sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "indexText",proxyIndexingLocalText ? "true":"false"); + sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "indexMedia",proxyIndexingLocalMedia ? 
"true":"false"); prop.put("info", 2);//new proxyPrefetchdepth prop.put("info_message", newProxyPrefetchDepth); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 8f4d463ba..2b8a85ecd 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -141,15 +141,15 @@ public class QuickCrawlLink_p { String urlhash = crawlingStartURL.hash(); switchboard.wordIndex.loadedURL.remove(urlhash); - switchboard.noticeURL.remove(urlhash); + switchboard.noticeURL.removeByURLHash(urlhash); switchboard.errorURL.remove(urlhash); // create crawling profile plasmaCrawlProfile.entry pe = null; try { - pe = switchboard.profiles.newEntry( + pe = switchboard.profilesActiveCrawls.newEntry( crawlingStartURL.getHost(), - crawlingStart, + crawlingStartURL, crawlingFilter, crawlingFilter, CrawlingDepth, diff --git a/htroot/WatchCrawler_p.html b/htroot/WatchCrawler_p.html index 974b7f045..5a935e3c1 100644 --- a/htroot/WatchCrawler_p.html +++ b/htroot/WatchCrawler_p.html @@ -1,5 +1,6 @@ +#(forwardToCrawlStart)#::#(/forwardToCrawlStart)# YaCy '#[clientname]#': Crawler Queues #%env/templates/metas.template%# @@ -9,7 +10,7 @@ #%env/templates/header.template%# -#%env/templates/submenuCrawler.template%# +#%env/templates/submenuIndexCreate.template%#

Crawler Queues

Next update in seconds. empty

diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index a8f8aa478..52f39d732 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -63,6 +63,7 @@ public class WatchCrawler_p { // return variable that accumulates replacements plasmaSwitchboard switchboard = (plasmaSwitchboard) env; serverObjects prop = new serverObjects(); + prop.put("forwardToCrawlStart", 0); if (post == null) { // not a crawl start, only monitoring @@ -70,6 +71,10 @@ public class WatchCrawler_p { } else { prop.put("info", 0); + if ((post.containsKey("autoforward")) && (switchboard.coreCrawlJobSize() == 0)) { + prop.put("forwardToCrawlStart", 1); + } + if (post.containsKey("continue")) { // continue queue String queue = post.get("continue", ""); @@ -158,18 +163,12 @@ public class WatchCrawler_p { if (pos == -1) crawlingStart = "http://" + crawlingStart; // normalizing URL - try {crawlingStart = new yacyURL(crawlingStart, null).toNormalform(true, true);} catch (MalformedURLException e1) {} - - // check if url is proper yacyURL crawlingStartURL = null; - try { - crawlingStartURL = new yacyURL(crawlingStart, null); - } catch (MalformedURLException e) { - crawlingStartURL = null; - } + try {crawlingStartURL = new yacyURL(crawlingStart, null);} catch (MalformedURLException e1) {} + crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true); // check if pattern matches - if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { + if ((crawlingStart == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { // print error message prop.put("info", 4); //crawlfilter does not match url prop.put("info_newcrawlingfilter", newcrawlingfilter); @@ -183,12 +182,13 @@ public class WatchCrawler_p { // first delete old entry, if exists String urlhash = (new yacyURL(crawlingStart, null)).hash(); switchboard.wordIndex.loadedURL.remove(urlhash); - switchboard.noticeURL.remove(urlhash); + switchboard.noticeURL.removeByURLHash(urlhash); switchboard.errorURL.remove(urlhash); // stack url - plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry( - crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, + switchboard.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it + plasmaCrawlProfile.entry pe = switchboard.profilesActiveCrawls.newEntry( + crawlingStartURL.getHost(), crawlingStartURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, @@ -268,7 +268,8 @@ public class WatchCrawler_p { HashMap hyperlinks = (HashMap) scraper.getAnchors(); // creating a crawler profile - plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, "file://" + file.toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); + yacyURL crawlURL = new yacyURL("file://" + file.toString(), null); + plasmaCrawlProfile.entry profile = switchboard.profilesActiveCrawls.newEntry(fileName, crawlURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); // loop through the contained links Iterator interator = 
hyperlinks.entrySet().iterator(); @@ -325,10 +326,11 @@ public class WatchCrawler_p { try { // getting the sitemap URL sitemapURLStr = post.get("sitemapURL",""); - + yacyURL sitemapURL = new yacyURL(sitemapURLStr, null); + // create a new profile - plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry( - sitemapURLStr, sitemapURLStr, newcrawlingfilter, newcrawlingfilter, + plasmaCrawlProfile.entry pe = switchboard.profilesActiveCrawls.newEntry( + sitemapURLStr, sitemapURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, diff --git a/htroot/WatchWebStructure_p.html b/htroot/WatchWebStructure_p.html index cb4559aa5..527db4d46 100644 --- a/htroot/WatchWebStructure_p.html +++ b/htroot/WatchWebStructure_p.html @@ -20,7 +20,7 @@ #%env/templates/header.template%# -#%env/templates/submenuCrawler.template%# +#%env/templates/submenuWebStructure.template%#

Web Structure

diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java index 818c11d4e..ce1331d0f 100644 --- a/htroot/WatchWebStructure_p.java +++ b/htroot/WatchWebStructure_p.java @@ -31,7 +31,7 @@ public class WatchWebStructure_p { if (host.equals("auto")) { // try to find the host from the crawl profiles - Iterator it = sb.profiles.profiles(true); + Iterator it = sb.profilesActiveCrawls.profiles(true); entry e; while (it.hasNext()) { e = (entry)it.next(); diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index 5e3154e67..bb25bc722 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -16,7 +16,8 @@
+ \ No newline at end of file diff --git a/htroot/env/templates/submenuWebStructure.template b/htroot/env/templates/submenuWebStructure.template new file mode 100644 index 000000000..9d46ab7ac --- /dev/null +++ b/htroot/env/templates/submenuWebStructure.template @@ -0,0 +1,7 @@ + \ No newline at end of file diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index d42e77e3e..0d323fa1b 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -160,7 +160,7 @@ public class SitemapParser extends DefaultHandler { if (theCrawlingProfile == null) { // create a new profile - this.crawlingProfile = createProfile(this.siteMapURL.getHost(),this.siteMapURL.toString()); + this.crawlingProfile = createProfile(this.siteMapURL.getHost(), this.siteMapURL); } else { // use an existing profile this.crawlingProfile = theCrawlingProfile; @@ -348,8 +348,8 @@ public class SitemapParser extends DefaultHandler { } } - private plasmaCrawlProfile.entry createProfile(String domainName, String sitemapURL) { - return this.switchboard.profiles.newEntry( + private plasmaCrawlProfile.entry createProfile(String domainName, yacyURL sitemapURL) { + return this.switchboard.profilesActiveCrawls.newEntry( domainName, sitemapURL, // crawlingFilter diff --git a/source/de/anomic/http/httpTemplate.java b/source/de/anomic/http/httpTemplate.java index fba1314df..6c1a5024a 100644 --- a/source/de/anomic/http/httpTemplate.java +++ b/source/de/anomic/http/httpTemplate.java @@ -464,9 +464,9 @@ public final class httpTemplate { while( (line = br.readLine()) != null ){ include.append(line.getBytes("UTF-8")).append(de.anomic.server.serverCore.crlfString.getBytes("UTF-8")); } - }catch(IOException e){ + } catch (IOException e) { //file not found? 
- serverLog.logSevere("FILEHANDLER","Include Error with file: " + new String(filename, "UTF-8")); + serverLog.logSevere("FILEHANDLER","Include Error with file " + new String(filename, "UTF-8") + ": " + e.getMessage()); } finally { if (br != null) try { br.close(); br=null; } catch (Exception e) {} } diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index 3df38b207..d50f41495 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -332,7 +332,7 @@ public final class CrawlWorker extends AbstractCrawlWorker { String urlhash = redirectionUrl.hash(); // removing url from loader queue - plasmaCrawlLoader.switchboard.noticeURL.remove(urlhash); + plasmaCrawlLoader.switchboard.noticeURL.removeByURLHash(urlhash); // retry crawling with new url this.url = redirectionUrl; diff --git a/source/de/anomic/plasma/dbImport/SitemapImporter.java b/source/de/anomic/plasma/dbImport/SitemapImporter.java index 8109f12e0..945bbbef8 100644 --- a/source/de/anomic/plasma/dbImport/SitemapImporter.java +++ b/source/de/anomic/plasma/dbImport/SitemapImporter.java @@ -113,7 +113,7 @@ public class SitemapImporter extends AbstractImporter implements dbImporter { this.sitemapURL = new yacyURL((String)initParams.get("sitemapURL"), null); // getting the crawling profile to use - plasmaCrawlProfile.entry profileEntry = this.sb.profiles.getEntry((String)initParams.get("crawlingProfile")); + plasmaCrawlProfile.entry profileEntry = this.sb.profilesActiveCrawls.getEntry((String)initParams.get("crawlingProfile")); // creating the sitemap parser this.parser = new SitemapParser(this.sb,this.sitemapURL,profileEntry); diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index 55e5ca384..ae8b3fbce 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -64,7 +64,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor this.preloadTime = Long.valueOf((String)initParams.get("preloadTime")).longValue(); File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db"); - File profileDbFile = new File(plasmaPath, "crawlProfiles0.db"); + File profileDbFile = new File(plasmaPath, plasmaSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); String errorMsg = null; if (!plasmaPath.exists()) @@ -169,7 +169,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor if (!this.importProfileHandleCache.contains(profileHandle)) { // testing if the profile is already known - plasmaCrawlProfile.entry profileEntry = this.sb.profiles.getEntry(profileHandle); + plasmaCrawlProfile.entry profileEntry = this.sb.profilesActiveCrawls.getEntry(profileHandle); // if not we need to import it if (profileEntry == null) { @@ -178,7 +178,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor if (sourceEntry != null) { this.profileCount++; this.importProfileHandleCache.add(profileHandle); - this.sb.profiles.newEntry((TreeMap)((TreeMap)sourceEntry.map()).clone()); + this.sb.profilesActiveCrawls.newEntry((TreeMap)((TreeMap)sourceEntry.map()).clone()); } else { this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown."); continue; @@ -193,7 +193,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor // removing hash from 
the import db } finally { - this.importNurlDB.remove(nextHash); + this.importNurlDB.removeByURLHash(nextHash); } if (this.urlCount % 100 == 0) { diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java index 1905e2351..ecfff03ff 100644 --- a/source/de/anomic/plasma/plasmaCrawlBalancer.java +++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java @@ -63,8 +63,8 @@ import de.anomic.yacy.yacySeedDB; public class plasmaCrawlBalancer { - private static final String stackSuffix = "7.stack"; - private static final String indexSuffix = "7.db"; + private static final String stackSuffix = "8.stack"; + private static final String indexSuffix = "8.db"; // a shared domainAccess map for all balancers private static final Map domainAccess = Collections.synchronizedMap(new HashMap()); @@ -85,10 +85,10 @@ public class plasmaCrawlBalancer { this.cacheStacksPath = cachePath; this.stackname = stackname; File stackFile = new File(cachePath, stackname + stackSuffix); - urlFileStack = kelondroStack.open(stackFile, stackrow); - domainStacks = new HashMap(); - urlRAMStack = new ArrayList(); - top = true; + this.urlFileStack = kelondroStack.open(stackFile, stackrow); + this.domainStacks = new HashMap(); + this.urlRAMStack = new ArrayList(); + this.top = true; // create a stack for newly entered entries if (!(cachePath.exists())) cachePath.mkdir(); // make the path @@ -142,6 +142,30 @@ public class plasmaCrawlBalancer { return new plasmaCrawlEntry(entry); } + public synchronized int removeAllByProfileHandle(String profileHandle) throws IOException { + // removes all entries with a specific profile hash. + // this may last some time + // returns number of deletions + + // first find a list of url hashes that shall be deleted + Iterator i = urlFileIndex.rows(true, null); + ArrayList urlHashes = new ArrayList(); + kelondroRow.Entry rowEntry; + plasmaCrawlEntry crawlEntry; + while (i.hasNext()) { + rowEntry = (kelondroRow.Entry) i.next(); + crawlEntry = new plasmaCrawlEntry(rowEntry); + if (crawlEntry.profileHandle().equals(profileHandle)) { + urlHashes.add(crawlEntry.url().hash()); + } + } + + // then delete all these urls from the queues and the file index + i = urlHashes.iterator(); + while (i.hasNext()) this.remove((String) i.next()); + return urlHashes.size(); + } + public synchronized plasmaCrawlEntry remove(String urlhash) throws IOException { // this method is only here, because so many import/export methods need it // and it was implemented in the previous architecture diff --git a/source/de/anomic/plasma/plasmaCrawlEntry.java b/source/de/anomic/plasma/plasmaCrawlEntry.java index c3ba841a7..39320ed03 100644 --- a/source/de/anomic/plasma/plasmaCrawlEntry.java +++ b/source/de/anomic/plasma/plasmaCrawlEntry.java @@ -48,7 +48,7 @@ public class plasmaCrawlEntry { "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash "String urlname-80, " + // the name of the url, from anchor tag name "Cardinal appdate-8 {b256}, " + // the time when the url was first time appeared - "String profile-4, " + // the name of the prefetch profile handle + "String profile-" + yacySeedDB.commonHashLength + ", " + // the name of the prefetch profile handle "Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0 "Cardinal parentbr-3 {b256}, " + // number of anchors of the parent "Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java 
b/source/de/anomic/plasma/plasmaCrawlNURL.java index 9712d9a9d..bea3725b1 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -139,7 +139,7 @@ public class plasmaCrawlNURL { return null; } - public plasmaCrawlEntry remove(String urlhash) { + public plasmaCrawlEntry removeByURLHash(String urlhash) { plasmaCrawlEntry entry = null; try {if ((entry = coreStack.remove(urlhash)) != null) return entry;} catch (IOException e) {} try {if ((entry = limitStack.remove(urlhash)) != null) return entry;} catch (IOException e) {} @@ -147,6 +147,14 @@ public class plasmaCrawlNURL { return null; } + public int removeByProfileHandle(String handle) { + int removed = 0; + try {removed += coreStack.removeAllByProfileHandle(handle);} catch (IOException e) {} + try {removed += limitStack.removeAllByProfileHandle(handle);} catch (IOException e) {} + try {removed += remoteStack.removeAllByProfileHandle(handle);} catch (IOException e) {} + return removed; + } + public plasmaCrawlEntry[] top(int stackType, int count) { switch (stackType) { case STACK_TYPE_CORE: return top(coreStack, count); diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index b4e8352b9..93c12ece0 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -55,6 +55,8 @@ import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMapObjects; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.server.serverCodings; +import de.anomic.yacy.yacySeedDB; +import de.anomic.yacy.yacyURL; public class plasmaCrawlProfile { @@ -64,13 +66,11 @@ public class plasmaCrawlProfile { private File profileTableFile; private long preloadTime; - public static final int crawlProfileHandleLength = 4; // name of the prefetch profile - public plasmaCrawlProfile(File file, long preloadTime) { this.profileTableFile = file; this.preloadTime = preloadTime; profileTableFile.getParentFile().mkdirs(); - kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, crawlProfileHandleLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true); + kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true); profileTable = new kelondroMapObjects(dyn, 500); } @@ -79,7 +79,7 @@ public class plasmaCrawlProfile { if (profileTable != null) profileTable.close(); if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database"); profileTableFile.getParentFile().mkdirs(); - kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, crawlProfileHandleLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true); + kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true); profileTable = new kelondroMapObjects(dyn, 500); } @@ -164,7 +164,7 @@ public class plasmaCrawlProfile { return ne; } - public entry newEntry(String name, String startURL, String generalFilter, String specificFilter, + public entry newEntry(String name, yacyURL startURL, String generalFilter, String specificFilter, int generalDepth, int specificDepth, int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages, boolean crawlingQ, @@ -257,7 +257,7 @@ public class plasmaCrawlProfile { private Map mem; 
private Map doms; - public entry(String name, String startURL, String generalFilter, String specificFilter, + public entry(String name, yacyURL startURL, String generalFilter, String specificFilter, int generalDepth, int specificDepth, int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages, boolean crawlingQ, @@ -266,11 +266,11 @@ public class plasmaCrawlProfile { boolean remoteIndexing, boolean xsstopw, boolean xdstopw, boolean xpstopw) { if (name == null || name.length() == 0) throw new NullPointerException("name must not be null"); - String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength); + String handle = (startURL == null) ? kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, yacySeedDB.commonHashLength) : startURL.hash(); mem = new HashMap(); mem.put(HANDLE, handle); mem.put(NAME, name); - mem.put(START_URL, (startURL == null) ? "" : startURL); + mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false)); mem.put(GENERAL_FILTER, (generalFilter == null) ? ".*" : generalFilter); mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter); mem.put(GENERAL_DEPTH, Integer.toString(generalDepth)); diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 3cb9517df..bcd254232 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -214,7 +214,7 @@ public final class plasmaCrawlStacker { public String dequeue(plasmaCrawlEntry theMsg) throws InterruptedException { - plasmaCrawlProfile.entry profile = this.sb.profiles.getEntry(theMsg.profileHandle()); + plasmaCrawlProfile.entry profile = this.sb.profilesActiveCrawls.getEntry(theMsg.profileHandle()); if (profile == null) { String errorMsg = "LOST PROFILE HANDLE '" + theMsg.profileHandle() + "' for URL " + theMsg.url(); this.log.logSevere(errorMsg); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 54fd261b8..d6dadf939 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -575,7 +575,7 @@ public final class plasmaParser { // testing if the resource is not empty if (sourceArray == null || sourceArray.length == 0) { - String errorMsg = "No resource content available."; + String errorMsg = "No resource content available (1)."; this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); } @@ -609,7 +609,7 @@ public final class plasmaParser { // testing if the resource is not empty if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { - String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available."; + String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2)."; this.theLogger.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 88d8bb5cd..fd60036b7 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -224,7 +224,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public blogBoard blogDB; public blogBoardComments blogCommentDB; public static plasmaCrawlRobotsTxt robots; - public plasmaCrawlProfile profiles; + public plasmaCrawlProfile profilesActiveCrawls, profilesPassiveCrawls; public plasmaCrawlProfile.entry defaultProxyProfile; public plasmaCrawlProfile.entry defaultRemoteProfile; public plasmaCrawlProfile.entry defaultTextSnippetProfile; @@ -866,7 +866,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser * * @see plasmaSwitchboard#DBPATH for the folder this file lies in */ - public static final String DBFILE_CRAWL_PROFILES = "crawlProfiles0.db"; + public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.db"; + public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.db"; /** *

public static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt.db"

*

Name of the file containing the database holding all robots.txt entries of the recently crawled domains

@@ -1066,12 +1067,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // make crawl profiles database and default profiles this.log.logConfig("Initializing Crawl Profiles"); - File profilesFile = new File(this.plasmaPath, DBFILE_CRAWL_PROFILES); - this.profiles = new plasmaCrawlProfile(profilesFile, ramProfiles_time); - initProfiles(); - log.logConfig("Loaded profiles from file " + profilesFile.getName() + - ", " + this.profiles.size() + " entries" + - ", " + ppRamString(profilesFile.length()/1024)); + File profilesActiveFile = new File(this.plasmaPath, DBFILE_ACTIVE_CRAWL_PROFILES); + this.profilesActiveCrawls = new plasmaCrawlProfile(profilesActiveFile, ramProfiles_time); + initActiveCrawlProfiles(); + log.logConfig("Loaded active crawl profiles from file " + profilesActiveFile.getName() + + ", " + this.profilesActiveCrawls.size() + " entries" + + ", " + ppRamString(profilesActiveFile.length()/1024)); + File profilesPassiveFile = new File(this.plasmaPath, DBFILE_PASSIVE_CRAWL_PROFILES); + this.profilesPassiveCrawls = new plasmaCrawlProfile(profilesPassiveFile, ramProfiles_time); + log.logConfig("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() + + ", " + this.profilesPassiveCrawls.size() + " entries" + + ", " + ppRamString(profilesPassiveFile.length()/1024)); // loading the robots.txt db this.log.logConfig("Initializing robots.txt DB"); @@ -1135,8 +1141,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logConfig("Starting Indexing Management"); noticeURL = new plasmaCrawlNURL(plasmaPath); //errorURL = new plasmaCrawlZURL(); // fresh error DB each startup; can be hold in RAM and reduces IO; - errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db", true); - delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db", false); + errorURL = new plasmaCrawlZURL(plasmaPath, "urlError1.db", true); + delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated1.db", false); wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, ramRWI_time, log); // set a high maximum cache size to current size; this is adopted later automatically @@ -1161,7 +1167,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser * initialize switchboard queue * ====================================================================== */ // create queue - this.sbQueue = new plasmaSwitchboardQueue(this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue1.stack"), this.profiles); + this.sbQueue = new plasmaSwitchboardQueue(this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue2.stack"), this.profilesActiveCrawls); // setting the indexing queue slots indexingSlots = (int) getConfigLong(INDEXER_SLOTS, 30); @@ -1504,7 +1510,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public void urlRemove(String hash) { wordIndex.loadedURL.remove(hash); - noticeURL.remove(hash); + noticeURL.removeByURLHash(hash); delegatedURL.remove(hash); errorURL.remove(hash); } @@ -1547,12 +1553,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return (bytes / 1024) + "TByte"; } - private void initProfiles() { + private void initActiveCrawlProfiles() { this.defaultProxyProfile = null; this.defaultRemoteProfile = null; this.defaultTextSnippetProfile = null; this.defaultMediaSnippetProfile = null; - Iterator i = this.profiles.profiles(true); + Iterator i = this.profilesActiveCrawls.profiles(true); 
plasmaCrawlProfile.entry profile; String name; while (i.hasNext()) { @@ -1565,7 +1571,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } if (this.defaultProxyProfile == null) { // generate new default entry for proxy crawling - this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", + this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, ".*", ".*", Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0")), Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0")), 60 * 24, -1, -1, false, @@ -1576,27 +1582,27 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling - defaultRemoteProfile = this.profiles.newEntry(CRAWL_PROFILE_REMOTE, "", ".*", ".*", 0, 0, + defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, ".*", ".*", 0, 0, -1, -1, -1, true, true, true, false, true, false, true, true, false); } if (this.defaultTextSnippetProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetProfile = this.profiles.newEntry(CRAWL_PROFILE_SNIPPET_TEXT, "", ".*", ".*", 0, 0, + defaultTextSnippetProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_TEXT, null, ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, true, true, true, true, false, true, true, false); } if (this.defaultMediaSnippetProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetProfile = this.profiles.newEntry(CRAWL_PROFILE_SNIPPET_MEDIA, "", ".*", ".*", 0, 0, + defaultMediaSnippetProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_MEDIA, null, ".*", ".*", 0, 0, 60 * 24 * 30, -1, -1, true, false, true, true, true, false, true, true, false); } } private void resetProfiles() { - final File pdb = new File(plasmaPath, DBFILE_CRAWL_PROFILES); + final File pdb = new File(plasmaPath, DBFILE_ACTIVE_CRAWL_PROFILES); if (pdb.exists()) pdb.delete(); long ramProfiles_time = getConfigLong(RAM_CACHE_PROFILES_TIME, 1000); - profiles = new plasmaCrawlProfile(pdb, ramProfiles_time); - initProfiles(); + profilesActiveCrawls = new plasmaCrawlProfile(pdb, ramProfiles_time); + initActiveCrawlProfiles(); } /** @@ -1623,7 +1629,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser */ public boolean cleanProfiles() throws InterruptedException { if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.notEmpty())) return false; - final Iterator iter = profiles.profiles(true); + final Iterator iter = profilesActiveCrawls.profiles(true); plasmaCrawlProfile.entry entry; boolean hasDoneSomething = false; try { @@ -1637,6 +1643,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser (entry.name().equals(CRAWL_PROFILE_REMOTE)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_TEXT)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_MEDIA)))) { + profilesPassiveCrawls.newEntry(entry.map()); iter.remove(); hasDoneSomething = true; } @@ -1780,7 +1787,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser messageDB.close(); if (facilityDB != null) facilityDB.close(); sbStackCrawlThread.close(); - profiles.close(); + profilesActiveCrawls.close(); robots.close(); parser.close(); plasmaHTCache.close(); @@ -1799,10 +1806,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public int queueSize() { 
return sbQueue.size(); - //return processStack.size() + cacheLoader.size() + noticeURL.stackSize(); } public void enQueue(Object job) { + assert job != null; if (!(job instanceof plasmaSwitchboardQueue.Entry)) { System.out.println("Internal error at plasmaSwitchboard.enQueue: wrong job type"); System.exit(0); @@ -1900,9 +1907,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ", overhangStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", remoteStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE)); try { + int sizeBefore = sbQueue.size(); nextentry = sbQueue.pop(); if (nextentry == null) { - log.logFine("deQueue: null entry on queue stack"); + log.logWarning("deQueue: null entry on queue stack."); + if (sbQueue.size() == sizeBefore) { + // this is a severe problem: because this time a null is returned, it means that this status will last forever + // to re-enable use of the sbQueue, it must be emptied completely + log.logSevere("deQueue: does not shrink after pop() == null. Emergency reset."); + sbQueue.clear(); + } return false; } } catch (IOException e) { @@ -2179,7 +2193,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); return true; } - plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); + plasmaCrawlProfile.entry profile = profilesActiveCrawls.getEntry(profileHandle); if (profile == null) { log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); return true; @@ -2244,7 +2258,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); - plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); + plasmaCrawlProfile.entry profile = profilesActiveCrawls.getEntry(profileHandle); if (profile == null) { log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); return true; @@ -2332,7 +2346,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + // urlEntry.url()); - plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); + plasmaCrawlProfile.entry profile = profilesActiveCrawls.getEntry(profileHandle); if (profile == null) { log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index 53537107d..802644143 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -82,7 +82,7 @@ public class plasmaSwitchboardQueue { "byte[] flags-1, " + // flags "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator "Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0 - "String profile-" + plasmaCrawlProfile.crawlProfileHandleLength + ", " + // the name of the prefetch profile handle + "String profile-" + yacySeedDB.commonHashLength + ", " + // the name of the prefetch profile handle "String 
urldescr-80", kelondroNaturalOrder.naturalOrder, 0); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index f3ce46eb6..b1adbf3a5 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -61,7 +61,7 @@ public final class plasmaWordIndex implements indexRI { // environment constants public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes public static final int wCacheMaxChunk = 1000; // number of references for each urlhash - public static final int lowcachedivisor = 200; + public static final int lowcachedivisor = 320; public static final int maxCollectionPartition = 7; // should be 7 private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder; diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java index 4e18d3fa7..d099418c4 100644 --- a/source/de/anomic/urlRedirector/urlRedirectord.java +++ b/source/de/anomic/urlRedirector/urlRedirectord.java @@ -34,11 +34,11 @@ public class urlRedirectord implements serverHandler { } if (profile == null) { - profile = switchboard.profiles.newEntry( + profile = switchboard.profilesActiveCrawls.newEntry( // name "URL Redirector", // start URL - "", + null, // crawling filter ".*", ".*", @@ -151,7 +151,7 @@ public class urlRedirectord implements serverHandler { if (pos != -1) { String newDepth = line.substring(pos).trim(); this.theLogger.logFine("Changing crawling depth to '" + newDepth + "'."); - switchboard.profiles.changeEntry(profile, "generalDepth",newDepth); + switchboard.profilesActiveCrawls.changeEntry(profile, "generalDepth",newDepth); } outputWriter.print("\r\n"); outputWriter.flush(); @@ -160,7 +160,7 @@ public class urlRedirectord implements serverHandler { if (pos != -1) { String newValue = line.substring(pos).trim(); this.theLogger.logFine("Changing crawl dynamic setting to '" + newValue + "'"); - switchboard.profiles.changeEntry(profile, "crawlingQ",newValue); + switchboard.profilesActiveCrawls.changeEntry(profile, "crawlingQ",newValue); } outputWriter.print("\r\n"); outputWriter.flush(); @@ -192,7 +192,7 @@ public class urlRedirectord implements serverHandler { // first delete old entry, if exists String urlhash = reqURL.hash(); switchboard.wordIndex.loadedURL.remove(urlhash); - switchboard.noticeURL.remove(urlhash); + switchboard.noticeURL.removeByURLHash(urlhash); switchboard.errorURL.remove(urlhash); // enqueuing URL for crawling
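
The hunks above replace the single crawl-profile table with an active/passive pair. As a minimal illustrative sketch (not part of the patch), assuming the plasmaSwitchboard fields and the removeByProfileHandle method introduced above, the terminate/delete flow added in CrawlProfileEditor_p.java boils down to:

    // Terminating a crawl moves its profile from the active to the passive table
    // and purges its pending URLs from the crawler queues.
    void terminateCrawl(plasmaSwitchboard sb, String handle) {
        plasmaCrawlProfile.entry e = sb.profilesActiveCrawls.getEntry(handle);
        if (e == null) return;                           // unknown handle: nothing to do
        sb.profilesPassiveCrawls.newEntry(e.map());      // keep the profile for the "terminated" list
        sb.profilesActiveCrawls.removeEntry(handle);     // stop feeding the crawler from it
        sb.noticeURL.removeByProfileHandle(handle);      // drop queued URLs belonging to this crawl
    }

    // Deleting is only offered for passive (terminated) profiles.
    void deleteTerminatedCrawl(plasmaSwitchboard sb, String handle) {
        sb.profilesPassiveCrawls.removeEntry(handle);
    }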