diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index a87c1b977..0680e2c42 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -24,13 +24,10 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -import java.text.DateFormat; import java.util.ArrayList; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.TreeMap; import java.util.regex.Pattern; @@ -39,8 +36,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.CrawlStacker; -import de.anomic.crawler.CrawlSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; @@ -50,19 +45,6 @@ public class CrawlProfileEditor_p { private final static String CRAWL_PROFILE_PREFIX = "crawlProfiles_"; private static final String EDIT_ENTRIES_PREFIX = "edit_entries_"; - private static final Set ignoreNames = new HashSet(); - static { - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_REMOTE); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE); - ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); - ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES); - } - public static class eentry { public static final int BOOLEAN = 0; public static final int INTEGER = 1; @@ -136,7 +118,7 @@ public class CrawlProfileEditor_p { final Map orderdHandles = new TreeMap(); for (final byte[] h : sb.crawler.getActive()) { selentry = sb.crawler.getActive(h); - if (selentry != null && !ignoreNames.contains(selentry.name())) { + if (selentry != null && !CrawlProfile.ignoreNames.contains(selentry.name())) { orderdHandles.put(selentry.name(), selentry.handle()); } } @@ -187,7 +169,7 @@ public class CrawlProfileEditor_p { // put active crawls into list for (final byte[] h: sb.crawler.getActive()) { profile = sb.crawler.getActive(h); - putProfileEntry(prop, sb.crawlStacker, profile, true, dark, count, domlistlength); + profile.putProfileEntry(CRAWL_PROFILE_PREFIX, prop, sb.crawlStacker, true, dark, count, domlistlength); dark = !dark; count++; } @@ -195,7 +177,7 @@ public class CrawlProfileEditor_p { boolean existPassiveCrawls = false; for (final byte[] h: sb.crawler.getPassive()) { profile = sb.crawler.getPassive(h); - putProfileEntry(prop, sb.crawlStacker, profile, false, dark, count, domlistlength); + profile.putProfileEntry(CRAWL_PROFILE_PREFIX, prop, sb.crawlStacker, false, dark, count, domlistlength); dark = !dark; count++; existPassiveCrawls = true; @@ -234,49 +216,4 @@ public class CrawlProfileEditor_p { return prop; } - private static void putProfileEntry( - final servletProperties prop, - final CrawlStacker crawlStacker, - final CrawlProfile profile, - final boolean active, - final boolean dark, - final int count, - final int domlistlength) { - - prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_name", profile.name()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", (!active || ignoreNames.contains(profile.name())) ? "0" : "1"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", profile.handle()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", (active) ? "0" : "1"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", profile.handle()); - prop.putXML(CRAWL_PROFILE_PREFIX + count + "_startURL", profile.startURL()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", profile.handle()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", profile.depth()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.urlMustMatchPattern().toString()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.urlMustNotMatchPattern().toString()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); - prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); - - int i = 0; - if (active && profile.domMaxPages() > 0 - && profile.domMaxPages() != Integer.MAX_VALUE) { - String item; - while (i <= domlistlength && !(item = profile.domName(true, i)).isEmpty()){ - if (i == domlistlength) { - item += " ..."; - } - prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item); - i++; - } - } - - prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i); - - prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(profile.domMaxPages())); - prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (profile.crawlingQ()) ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", (profile.storeHTCache()) ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", (profile.indexText()) ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", (profile.indexMedia()) ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", (profile.remoteIndexing()) ? "1" : "0"); - } } diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index ce86f9321..4b706f93f 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -21,14 +21,14 @@ #%env/templates/submenuCrawlMonitor.template%#

Crawler Queues

- +
+Queues - - + @@ -38,7 +38,6 @@ - @@ -48,7 +47,6 @@ - @@ -58,7 +56,6 @@ - @@ -68,18 +65,17 @@ - - + -
Queue Size MaxPause/Resume
Local Crawler unlimited
Limit Crawler unlimited
Remote Crawler unlimited
No-Load Crawler unlimited
LoaderLoader (#[loaderMax]#) #[loaderSize]#  #[loaderMax]#
- - +
+
+Index Size @@ -96,7 +92,9 @@
- +
+
+Progress
@@ -158,9 +156,40 @@ it may take some seconds until the first result appears there. If you crawl any un-wanted pages, you can delete them here.
#(/info)#

- + +

+ +#(crawlProfilesShow)#:: +
+Running Crawls +
+ + + + + + + + + #{list}# + + + + + #{/list}# +
Start URLStatus
#[startURL]##(terminateButton)#:: +
Running
+
+ + +
+ #(/terminateButton)# +
+
+#(/crawlProfilesShow)# +

See an access timing

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index bcad9fd42..75a28a323 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -46,6 +46,7 @@ import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.NewsPool; @@ -127,6 +128,18 @@ public class Crawler_p { } } + if (post != null && post.containsKey("terminate")) try { + final String handle = post.get("handle", ""); + // termination of a crawl: shift the crawl from active to passive + final CrawlProfile p = sb.crawler.getActive(handle.getBytes()); + if (p != null) sb.crawler.putPassive(handle.getBytes(), p); + // delete all entries from the crawl queue that are deleted here + sb.crawler.removeActive(handle.getBytes()); + sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); + } catch (final RowSpaceExceededException e) { + Log.logException(e); + } + if (post != null && post.containsKey("crawlingstart")) { // init crawl if (sb.peers == null) { @@ -614,6 +627,24 @@ public class Crawler_p { prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0"); prop.put("customPPMdefault", Integer.toString(LCppm)); + + // generate crawl profile table + int count = 0; + boolean dark = true; + final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160); + CrawlProfile profile; + // put active crawls into list + for (final byte[] h: sb.crawler.getActive()) { + profile = sb.crawler.getActive(h); + if (CrawlProfile.ignoreNames.contains(profile.name())) continue; + profile.putProfileEntry("crawlProfilesShow_list_", prop, sb.crawlStacker, true, dark, count, domlistlength); + dark = !dark; + count++; + } + prop.put("crawlProfilesShow_list", count); + prop.put("crawlProfilesShow", count == 0 ? 0 : 1); + + // return rewrite properties return prop; } diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 09745b5ca..c5a6e24bc 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -25,11 +25,17 @@ package de.anomic.crawler; +import java.text.DateFormat; +import java.util.HashSet; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; +import de.anomic.server.serverObjects; +import de.anomic.server.servletProperties; + import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.services.federated.yacy.CacheStrategy; @@ -495,4 +501,64 @@ public class CrawlProfile extends ConcurrentHashMap implements M } } } + + + public static final Set ignoreNames = new HashSet(); + static { + ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY); + ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_REMOTE); + ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); + ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT); + ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA); + ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT); + ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE); + ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); + ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES); + } + + public void putProfileEntry( + final String CRAWL_PROFILE_PREFIX, + final serverObjects prop, + final CrawlStacker crawlStacker, + final boolean active, + final boolean dark, + final int count, + final int domlistlength) { + + prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0"); + prop.put(CRAWL_PROFILE_PREFIX + count + "_name", this.name()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", (!active || ignoreNames.contains(this.name())) ? "0" : "1"); + prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", (active) ? "0" : "1"); + prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle()); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_startURL", this.startURL()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); + prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); + + int i = 0; + if (active && this.domMaxPages() > 0 + && this.domMaxPages() != Integer.MAX_VALUE) { + String item; + while (i <= domlistlength && !(item = this.domName(true, i)).isEmpty()){ + if (i == domlistlength) { + item += " ..."; + } + prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item); + i++; + } + } + + prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i); + + prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); + prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (this.crawlingQ()) ? "1" : "0"); + prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", (this.storeHTCache()) ? "1" : "0"); + prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", (this.indexText()) ? "1" : "0"); + prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", (this.indexMedia()) ? "1" : "0"); + prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", (this.remoteIndexing()) ? "1" : "0"); + } }