#{/crawlProfiles}#
diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java
index ab1eb894b..2e8f26464 100644
--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@@ -1,11 +1,15 @@
// CrawlProfileEditor_p.java
-// -------------------------------
-// part of the AnomicHTTPD caching proxy
-// (C) by Michael Peter Christen; mc@anomic.de
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2004, 2005
-// last major change: 04.07.2005
+// (C) 2005, by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 04.07.2005 on http://yacy.net
//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@@ -19,29 +23,6 @@
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this softare or this documentation. The usage of this software
-// is on your own risk. The installation and usage (starting/running) of this
-// software may allow other people or application to access your computer and
-// any attached devices and is highly dependent on the configuration of the
-// software which must be done by the user of the software; the author(s) is
-// (are) also not responsible for proper configuration and usage of the
-// software, even if provoked by documentation provided together with
-// the software.
-//
-// Any changes to this file according to the GPL as documented in the file
-// gpl.txt aside this file in the shipment you received can be done to the
-// lines that follows this copyright notice here, but changes must not be
-// done inside the copyright notive above. A re-distribution must contain
-// the intact and unchanged copyright notice.
-// Contributions and changes to the program code must be marked as such.
-
-// You must compile this file with
-// javac -classpath .:../classes CrawlProfileEditor_p.java
-// if the shell's current path is HTROOT
import java.io.IOException;
import java.util.ArrayList;
@@ -103,14 +84,23 @@ public class CrawlProfileEditor_p {
// read post for handle
String handle = (post == null) ? "" : post.get("handle", "");
- if ((post != null) && (post.containsKey("deleteprofile"))) {
- // deletion of a crawl
- sb.profiles.removeEntry(handle);
+ if (post != null) {
+ if (post.containsKey("terminate")) {
+ // termination of a crawl: shift the crawl from active to passive
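+                // note: the posted handle is assumed to name an existing active profile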
+ sb.profilesPassiveCrawls.newEntry(sb.profilesActiveCrawls.getEntry(handle).map());
+ sb.profilesActiveCrawls.removeEntry(handle);
+                // delete all crawl queue entries that belong to the terminated profile
+ sb.noticeURL.removeByProfileHandle(handle);
+ }
+ if (post.containsKey("delete")) {
+ // deletion of a terminated crawl profile
+ sb.profilesPassiveCrawls.removeEntry(handle);
+ }
}
// generate handle list
int count = 0;
- Iterator it = sb.profiles.profiles(true);
+ Iterator it = sb.profilesActiveCrawls.profiles(true);
entry selentry;
while (it.hasNext()) {
selentry = (entry)it.next();
@@ -126,7 +116,7 @@ public class CrawlProfileEditor_p {
count++;
}
prop.put("profiles", count);
- selentry = sb.profiles.getEntry(handle);
+ selentry = sb.profilesActiveCrawls.getEntry(handle);
// read post for change submit
if ((post != null) && (selentry != null)) {
@@ -138,7 +128,7 @@ public class CrawlProfileEditor_p {
tee = (eentry) it.next();
String cval = (String) selentry.map().get(tee.name);
String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
- if (!cval.equals(val)) sb.profiles.changeEntry(selentry, tee.name, val);
+ if (!cval.equals(val)) sb.profilesActiveCrawls.changeEntry(selentry, tee.name, val);
}
} catch (IOException ex) {
prop.put("error", 1);
@@ -149,47 +139,22 @@ public class CrawlProfileEditor_p {
// generate crawl profile table
count = 0;
+ boolean dark = true;
int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
- it = sb.profiles.profiles(true);
plasmaCrawlProfile.entry profile;
- boolean dark = true;
+ // put active crawls into list
+ it = sb.profilesActiveCrawls.profiles(true);
while (it.hasNext()) {
profile = (plasmaCrawlProfile.entry) it.next();
- prop.put("crawlProfiles_"+count+"_dark", ((dark) ? 1 : 0));
- prop.put("crawlProfiles_"+count+"_name", profile.name());
- prop.put("crawlProfiles_"+count+"_startURL", profile.startURL());
- prop.put("crawlProfiles_"+count+"_handle", profile.handle());
- prop.put("crawlProfiles_"+count+"_depth", profile.generalDepth());
- prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
- prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
- prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));
-
- //start contrib [MN]
- int i = 0;
- String item;
- while((i <= domlistlength) && !((item = profile.domName(true, i)).equals(""))){
- if(i == domlistlength){
- item = item + " ...";
- }
- prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent_"+i+"_item", item);
- i++;
- }
-
- prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", i);
- //end contrib [MN]
-
- prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
- prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
- prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));
- prop.put("crawlProfiles_"+count+"_indexText", ((profile.indexText()) ? 1 : 0));
- prop.put("crawlProfiles_"+count+"_indexMedia", ((profile.indexMedia()) ? 1 : 0));
- prop.put("crawlProfiles_"+count+"_remoteIndexing", ((profile.remoteIndexing()) ? 1 : 0));
- prop.put("crawlProfiles_"+count+"_deleteButton", (((profile.name().equals("remote")) ||
- (profile.name().equals("proxy")) ||
- (profile.name().equals("snippetText")) ||
- (profile.name().equals("snippetMedia")) ? 0 : 1)));
- prop.put("crawlProfiles_"+count+"_deleteButton_handle", profile.handle());
-
+ putProfileEntry(prop, profile, true, dark, count, domlistlength);
+ dark = !dark;
+ count++;
+ }
+ // put passive crawls into list
+ it = sb.profilesPassiveCrawls.profiles(true);
+ while (it.hasNext()) {
+ profile = (plasmaCrawlProfile.entry) it.next();
+ putProfileEntry(prop, profile, false, dark, count, domlistlength);
dark = !dark;
count++;
}
@@ -223,4 +188,44 @@ public class CrawlProfileEditor_p {
return prop;
}
+
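+    /**
+     * Writes one crawl profile as a table row into the servlet properties.
+     *
+     * @param prop          the servlet properties to fill
+     * @param profile       the crawl profile to display
+     * @param active        true for active crawls; enables the terminate button
+     * @param dark          alternating flag for the row shading
+     * @param count         row index, used as key prefix in the template
+     * @param domlistlength maximum number of domain filter entries to display
+     */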
+ private static void putProfileEntry(servletProperties prop, plasmaCrawlProfile.entry profile, boolean active, boolean dark, int count, int domlistlength) {
+ prop.put("crawlProfiles_" + count + "_dark", ((dark) ? 1 : 0));
+ prop.put("crawlProfiles_" + count + "_status", ((active) ? 1 : 0));
+ prop.put("crawlProfiles_" + count + "_name", profile.name());
+ prop.put("crawlProfiles_" + count + "_startURL", profile.startURL());
+ prop.put("crawlProfiles_" + count + "_handle", profile.handle());
+ prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
+ prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter());
+ prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
+ prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));
+
+ // start contrib [MN]
+ int i = 0;
+ String item;
+        while ((i <= domlistlength) && !((item = profile.domName(true, i)).equals(""))) {
+            if (i == domlistlength) {
+                item = item + " ...";
+            }
+            prop.put("crawlProfiles_" + count + "_crawlingDomFilterContent_" + i + "_item", item);
+            i++;
+        }
+
+        prop.put("crawlProfiles_" + count + "_crawlingDomFilterContent", i);
+ // end contrib [MN]
+
+ prop.put("crawlProfiles_" + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
+ prop.put("crawlProfiles_" + count + "_withQuery", (profile.crawlingQ()) ? 1 : 0);
+ prop.put("crawlProfiles_" + count + "_storeCache", (profile.storeHTCache()) ? 1 : 0);
+ prop.put("crawlProfiles_" + count + "_indexText", (profile.indexText()) ? 1 : 0);
+ prop.put("crawlProfiles_" + count + "_indexMedia", (profile.indexMedia()) ? 1 : 0);
+ prop.put("crawlProfiles_" + count + "_remoteIndexing", (profile.remoteIndexing()) ? 1 : 0);
+ prop.put("crawlProfiles_" + count + "_terminateButton", ((!active) || (profile.name().equals("remote")) ||
+ (profile.name().equals("proxy")) ||
+ (profile.name().equals("snippetText")) ||
+ (profile.name().equals("snippetMedia"))) ? 0 : 1);
+ prop.put("crawlProfiles_" + count + "_terminateButton_handle", profile.handle());
+ prop.put("crawlProfiles_" + count + "_deleteButton", (active) ? 0 : 1);
+ prop.put("crawlProfiles_" + count + "_deleteButton_handle", profile.handle());
+ }
}
diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java
index 7b992180e..a8e0a3e24 100644
--- a/htroot/CrawlResults.java
+++ b/htroot/CrawlResults.java
@@ -69,6 +69,11 @@ public class CrawlResults {
tabletype = 0;
}
+ if ((post != null) && (post.containsKey("autoforward")) && (tabletype == 5) && (sb.wordIndex.loadedURL.getStackSize(5) == 0)) {
+        // the main menu links to the local crawler page; if that table is empty, the overview page is shown instead
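+        // (tabletype 5 and loadedURL stack 5 refer to the local crawling result list)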
+ tabletype = 0;
+ }
+
// check if authorization is needed and/or given
if (((tabletype > 0) && (tabletype < 6)) ||
(post.containsKey("clearlist")) ||
diff --git a/htroot/CrawlURLFetch_p.java b/htroot/CrawlURLFetch_p.java
index f16e714ed..3abacae11 100644
--- a/htroot/CrawlURLFetch_p.java
+++ b/htroot/CrawlURLFetch_p.java
@@ -83,9 +83,9 @@ public class CrawlURLFetch_p {
public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) {
if (profile == null) {
- profile = ((plasmaSwitchboard)env).profiles.newEntry(
+ profile = ((plasmaSwitchboard)env).profilesActiveCrawls.newEntry(
"URLFetcher", // Name
- "", // URL
+ null, // URL
".*", ".*", // General / specific filter
0, 0, // General / specific depth
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
diff --git a/htroot/IndexCreateIndexingQueue_p.html b/htroot/IndexCreateIndexingQueue_p.html
index 2e467b9f1..c127527b0 100644
--- a/htroot/IndexCreateIndexingQueue_p.html
+++ b/htroot/IndexCreateIndexingQueue_p.html
@@ -6,7 +6,7 @@
#%env/templates/header.template%#
- #%env/templates/submenuCrawler.template%#
+ #%env/templates/submenuIndexCreate.template%#
Indexing Queue
diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java
index a99f81890..970c69376 100644
--- a/htroot/IndexCreateIndexingQueue_p.java
+++ b/htroot/IndexCreateIndexingQueue_p.java
@@ -92,6 +92,7 @@ public class IndexCreateIndexingQueue_p {
plasmaHTCache.deleteFile(entry.url());
}
}
+            switchboard.sbQueue.clear(); // clear the backing file so the queue content is removed completely
}
} catch (Exception e) {}
} else if (post.containsKey("deleteEntry")) {
diff --git a/htroot/IndexCreateLoaderQueue_p.html b/htroot/IndexCreateLoaderQueue_p.html
index cd15f5c06..ba8727f4a 100644
--- a/htroot/IndexCreateLoaderQueue_p.html
+++ b/htroot/IndexCreateLoaderQueue_p.html
@@ -6,7 +6,7 @@
#%env/templates/header.template%#
- #%env/templates/submenuCrawler.template%#
+ #%env/templates/submenuIndexCreate.template%#
diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java
index a8f8aa478..52f39d732 100644
--- a/htroot/WatchCrawler_p.java
+++ b/htroot/WatchCrawler_p.java
@@ -63,6 +63,7 @@ public class WatchCrawler_p {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
+ prop.put("forwardToCrawlStart", 0);
if (post == null) {
// not a crawl start, only monitoring
@@ -70,6 +71,10 @@ public class WatchCrawler_p {
} else {
prop.put("info", 0);
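+            // optionally forward to the crawl start page when no local crawl is running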
+ if ((post.containsKey("autoforward")) && (switchboard.coreCrawlJobSize() == 0)) {
+ prop.put("forwardToCrawlStart", 1);
+ }
+
if (post.containsKey("continue")) {
// continue queue
String queue = post.get("continue", "");
@@ -158,18 +163,12 @@ public class WatchCrawler_p {
if (pos == -1) crawlingStart = "http://" + crawlingStart;
// normalizing URL
- try {crawlingStart = new yacyURL(crawlingStart, null).toNormalform(true, true);} catch (MalformedURLException e1) {}
-
- // check if url is proper
yacyURL crawlingStartURL = null;
- try {
- crawlingStartURL = new yacyURL(crawlingStart, null);
- } catch (MalformedURLException e) {
- crawlingStartURL = null;
- }
+ try {crawlingStartURL = new yacyURL(crawlingStart, null);} catch (MalformedURLException e1) {}
+ crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
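+            // crawlingStart is null here exactly when the URL could not be parsed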
// check if pattern matches
- if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
+ if ((crawlingStart == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", 4); //crawlfilter does not match url
prop.put("info_newcrawlingfilter", newcrawlingfilter);
@@ -183,12 +182,13 @@ public class WatchCrawler_p {
// first delete old entry, if exists
String urlhash = (new yacyURL(crawlingStart, null)).hash();
switchboard.wordIndex.loadedURL.remove(urlhash);
- switchboard.noticeURL.remove(urlhash);
+ switchboard.noticeURL.removeByURLHash(urlhash);
switchboard.errorURL.remove(urlhash);
// stack url
- plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(
- crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter,
+ switchboard.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
+ plasmaCrawlProfile.entry pe = switchboard.profilesActiveCrawls.newEntry(
+ crawlingStartURL.getHost(), crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
@@ -268,7 +268,8 @@ public class WatchCrawler_p {
HashMap hyperlinks = (HashMap) scraper.getAnchors();
// creating a crawler profile
- plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, "file://" + file.toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
+ yacyURL crawlURL = new yacyURL("file://" + file.toString(), null);
+ plasmaCrawlProfile.entry profile = switchboard.profilesActiveCrawls.newEntry(fileName, crawlURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@@ -325,10 +326,11 @@ public class WatchCrawler_p {
try {
// getting the sitemap URL
sitemapURLStr = post.get("sitemapURL","");
-
+ yacyURL sitemapURL = new yacyURL(sitemapURLStr, null);
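+                    // a malformed sitemap URL is presumably caught by the enclosing try block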
+
// create a new profile
- plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(
- sitemapURLStr, sitemapURLStr, newcrawlingfilter, newcrawlingfilter,
+ plasmaCrawlProfile.entry pe = switchboard.profilesActiveCrawls.newEntry(
+ sitemapURLStr, sitemapURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
diff --git a/htroot/WatchWebStructure_p.html b/htroot/WatchWebStructure_p.html
index cb4559aa5..527db4d46 100644
--- a/htroot/WatchWebStructure_p.html
+++ b/htroot/WatchWebStructure_p.html
@@ -20,7 +20,7 @@
#%env/templates/header.template%#
-#%env/templates/submenuCrawler.template%#
+#%env/templates/submenuWebStructure.template%#
Web Structure
diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java
index 818c11d4e..ce1331d0f 100644
--- a/htroot/WatchWebStructure_p.java
+++ b/htroot/WatchWebStructure_p.java
@@ -31,7 +31,7 @@ public class WatchWebStructure_p {
if (host.equals("auto")) {
// try to find the host from the crawl profiles
- Iterator it = sb.profiles.profiles(true);
+ Iterator it = sb.profilesActiveCrawls.profiles(true);
entry e;
while (it.hasNext()) {
e = (entry)it.next();
diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template
index 5e3154e67..bb25bc722 100644
--- a/htroot/env/templates/header.template
+++ b/htroot/env/templates/header.template
@@ -16,7 +16,8 @@
\ No newline at end of file
diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template
index 2245182ea..fcea0c668 100644
--- a/htroot/env/templates/submenuIndexCreate.template
+++ b/htroot/env/templates/submenuIndexCreate.template
@@ -4,5 +4,35 @@
\ No newline at end of file
diff --git a/htroot/env/templates/submenuWebStructure.template b/htroot/env/templates/submenuWebStructure.template
new file mode 100644
index 000000000..9d46ab7ac
--- /dev/null
+++ b/htroot/env/templates/submenuWebStructure.template
@@ -0,0 +1,7 @@
+