Added more steering in Crawler_p.html interface

pull/1/head
Michael Peter Christen 13 years ago
parent acc19e190d
commit 16b21f7a5b

@ -24,13 +24,10 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
@ -39,8 +36,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
@ -50,19 +45,6 @@ public class CrawlProfileEditor_p {
// template prefix for the rows of the crawl-profile table (crawlProfiles_<n>_...)
private final static String CRAWL_PROFILE_PREFIX = "crawlProfiles_";
// template prefix for the editable entry fields of a single profile
private static final String EDIT_ENTRIES_PREFIX = "edit_entries_";
// names of the built-in system crawl profiles that must not be shown,
// edited or terminated through this servlet; populated once at class load
private static final Set<String> ignoreNames = new HashSet<String>();
static {
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_REMOTE);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE);
// NOTE(review): these two look like database file names rather than
// profile names — presumably listed defensively; confirm against
// CrawlSwitchboard before removing
ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES);
}
public static class eentry {
public static final int BOOLEAN = 0;
public static final int INTEGER = 1;
@ -136,7 +118,7 @@ public class CrawlProfileEditor_p {
final Map<String, String> orderdHandles = new TreeMap<String, String>();
for (final byte[] h : sb.crawler.getActive()) {
selentry = sb.crawler.getActive(h);
if (selentry != null && !ignoreNames.contains(selentry.name())) {
if (selentry != null && !CrawlProfile.ignoreNames.contains(selentry.name())) {
orderdHandles.put(selentry.name(), selentry.handle());
}
}
@ -187,7 +169,7 @@ public class CrawlProfileEditor_p {
// put active crawls into list
for (final byte[] h: sb.crawler.getActive()) {
profile = sb.crawler.getActive(h);
putProfileEntry(prop, sb.crawlStacker, profile, true, dark, count, domlistlength);
profile.putProfileEntry(CRAWL_PROFILE_PREFIX, prop, sb.crawlStacker, true, dark, count, domlistlength);
dark = !dark;
count++;
}
@ -195,7 +177,7 @@ public class CrawlProfileEditor_p {
boolean existPassiveCrawls = false;
for (final byte[] h: sb.crawler.getPassive()) {
profile = sb.crawler.getPassive(h);
putProfileEntry(prop, sb.crawlStacker, profile, false, dark, count, domlistlength);
profile.putProfileEntry(CRAWL_PROFILE_PREFIX, prop, sb.crawlStacker, false, dark, count, domlistlength);
dark = !dark;
count++;
existPassiveCrawls = true;
@ -234,49 +216,4 @@ public class CrawlProfileEditor_p {
return prop;
}
/**
 * Writes one crawl profile as a row of the crawl-profile table into the
 * servlet properties under the {@code crawlProfiles_<count>_...} keys.
 *
 * @param prop          target servlet properties to fill
 * @param crawlStacker  crawl stacker (currently unused, kept for interface stability)
 * @param profile       the crawl profile to render
 * @param active        true if the profile is in the active crawl set
 * @param dark          alternating row shading flag for the template
 * @param count         row index, used as the template key infix
 * @param domlistlength maximum number of domain filter entries to list
 */
private static void putProfileEntry(
final servletProperties prop,
final CrawlStacker crawlStacker,
final CrawlProfile profile,
final boolean active,
final boolean dark,
final int count,
final int domlistlength) {
final String prefix = CRAWL_PROFILE_PREFIX + count + "_";
prop.put(prefix + "dark", dark ? "1" : "0");
prop.put(prefix + "name", profile.name());
// system profiles and passive profiles cannot be terminated
final boolean terminable = active && !ignoreNames.contains(profile.name());
prop.put(prefix + "terminateButton", terminable ? "1" : "0");
prop.put(prefix + "terminateButton_handle", profile.handle());
// only passive (finished/terminated) profiles may be deleted
prop.put(prefix + "deleteButton", active ? "0" : "1");
prop.put(prefix + "deleteButton_handle", profile.handle());
prop.putXML(prefix + "startURL", profile.startURL());
prop.put(prefix + "handle", profile.handle());
prop.put(prefix + "depth", profile.depth());
prop.put(prefix + "mustmatch", profile.urlMustMatchPattern().toString());
prop.put(prefix + "mustnotmatch", profile.urlMustNotMatchPattern().toString());
// a recrawl age of 0 means re-crawling is disabled for this profile
prop.put(prefix + "crawlingIfOlder", profile.recrawlIfOlder() == 0L
? "no re-crawl"
: DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put(prefix + "crawlingDomFilterDepth", "inactive");
// enumerate the domain filter entries; the last listed entry is
// suffixed with " ..." to indicate truncation at domlistlength
int entryCount = 0;
if (active && profile.domMaxPages() > 0 && profile.domMaxPages() != Integer.MAX_VALUE) {
String domain;
while (entryCount <= domlistlength && !(domain = profile.domName(true, entryCount)).isEmpty()) {
if (entryCount == domlistlength) {
domain += " ...";
}
prop.putHTML(prefix + "crawlingDomFilterContent_" + entryCount + "_item", domain);
entryCount++;
}
}
prop.put(prefix + "crawlingDomFilterContent", entryCount);
prop.put(prefix + "crawlingDomMaxPages", profile.domMaxPages() == Integer.MAX_VALUE
? "unlimited" : Integer.toString(profile.domMaxPages()));
prop.put(prefix + "withQuery", profile.crawlingQ() ? "1" : "0");
prop.put(prefix + "storeCache", profile.storeHTCache() ? "1" : "0");
prop.put(prefix + "indexText", profile.indexText() ? "1" : "0");
prop.put(prefix + "indexMedia", profile.indexMedia() ? "1" : "0");
prop.put(prefix + "remoteIndexing", profile.remoteIndexing() ? "1" : "0");
}
}

@ -21,14 +21,14 @@
#%env/templates/submenuCrawlMonitor.template%#
<h2>Crawler Queues</h2>
<noscript><p>(Please enable JavaScript to automatically update this page!)</p></noscript>
<fieldset style="width:240px;height:130px;float:left;">
<legend>Queues</legend>
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
<tr class="TableHeader">
<th>Queue</th>
<th>Size</th>
<th>&nbsp;</th>
<th>Max</th>
<th width="60">Pause/Resume</th>
</tr>
<tr class="TableCellLight">
<td align="left">Local Crawler</td>
@ -38,7 +38,6 @@
<img src="" alt="" style="width:12px; height:12px;" id="localcrawlerstateIMG" />
</a>
</td>
<td align="right">unlimited</td>
</tr>
<tr class="TableCellLight">
<td align="left">Limit Crawler</td>
@ -48,7 +47,6 @@
<img src="" alt="" style="width:12px; height:12px;" id="limitcrawlerstateIMG" />
</a>
</td>
<td align="right">unlimited</td>
</tr>
<tr class="TableCellLight">
<td align="left">Remote Crawler</td>
@ -58,7 +56,6 @@
<img src="" alt="" style="width:12px; height:12px;" id="remotecrawlerstateIMG" />
</a>
</td>
<td align="right">unlimited</td>
</tr>
<tr class="TableCellLight">
<td align="left">No-Load Crawler</td>
@ -68,18 +65,17 @@
<img src="" alt="" style="width:12px; height:12px;" id="noloadcrawlerstateIMG" />
</a>
</td>
<td align="right">unlimited</td>
</tr>
<tr class="TableCellLight">
<td align="left">Loader</td>
<td align="left">Loader (<span id="loaderqueuemax">#[loaderMax]#</span>)</td>
<td align="right"><span id="loaderqueuesize">#[loaderSize]#</span></td>
<td>&nbsp;</td>
<td align="right"><span id="loaderqueuemax">#[loaderMax]#</span></td>
</tr>
</tbody>
</table>
</fieldset>
<fieldset style="width:160px;height:130px;float:left;">
<legend>Index Size</legend>
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
<tr class="TableHeader">
@ -96,7 +92,9 @@
</tr>
</tbody>
</table>
</fieldset>
<fieldset style="width:440px;height:130px;float:left;">
<legend>Progress</legend>
<form action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
@ -158,9 +156,40 @@
it may take some seconds until the first result appears there.</strong>
If you crawl any un-wanted pages, you can delete them <a href="IndexCreateWWWLocalQueue_p.html">here</a>.<br />
#(/info)# </p>
</fieldset>
<p style="clear:both;"></p>
<!-- crawl queues -->
<!-- crawl profile list -->
#(crawlProfilesShow)#::
<fieldset>
<legend>Running Crawls</legend>
<table border="0" cellpadding="2" cellspacing="1" summary="A list of crawl profiles and their current settings.">
<colgroup>
<col width="16" />
<col width="140"/>
</colgroup>
<tr class="TableHeader">
<td><strong>Start URL</strong></td>
<td><strong>Status</strong></td>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td><a href="#[startURL]#">#[startURL]#</a></td>
<td>#(terminateButton)#::
<div style="text-decoration:blink;float:left;">Running</div>
<form style="float:left;" action="Crawler_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8"><div>
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="terminate" value="Terminate" />
</div></form>
#(/terminateButton)#
</td>
</tr>
#{/list}#
</table>
</fieldset>
#(/crawlProfilesShow)#
<p>See an <a href="/api/latency_p.xml">access timing</a></p>
<iframe id="QueuesTable" src="IndexCreateQueues_p.html?embed=&urlsPerHost=1" width="100%" height="0" align="left" scrolling="no" marginheight="0" marginwidth="0" frameborder="0" ></iframe>

@ -46,6 +46,7 @@ import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.peers.NewsPool;
@ -127,6 +128,18 @@ public class Crawler_p {
}
}
if (post != null && post.containsKey("terminate")) try {
final String handle = post.get("handle", "");
// termination of a crawl: shift the crawl from active to passive
final CrawlProfile p = sb.crawler.getActive(handle.getBytes());
if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (final RowSpaceExceededException e) {
Log.logException(e);
}
if (post != null && post.containsKey("crawlingstart")) {
// init crawl
if (sb.peers == null) {
@ -614,6 +627,24 @@ public class Crawler_p {
prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
prop.put("customPPMdefault", Integer.toString(LCppm));
// generate crawl profile table
int count = 0;
boolean dark = true;
final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
CrawlProfile profile;
// put active crawls into list
for (final byte[] h: sb.crawler.getActive()) {
profile = sb.crawler.getActive(h);
if (CrawlProfile.ignoreNames.contains(profile.name())) continue;
profile.putProfileEntry("crawlProfilesShow_list_", prop, sb.crawlStacker, true, dark, count, domlistlength);
dark = !dark;
count++;
}
prop.put("crawlProfilesShow_list", count);
prop.put("crawlProfilesShow", count == 0 ? 0 : 1);
// return rewrite properties
return prop;
}

@ -25,11 +25,17 @@
package de.anomic.crawler;
import java.text.DateFormat;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import de.anomic.server.serverObjects;
import de.anomic.server.servletProperties;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
@ -495,4 +501,64 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
}
// names of the built-in system crawl profiles that servlets must not show,
// edit or terminate; shared lookup set, populated once at class load
public static final Set<String> ignoreNames = new HashSet<String>();
static {
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_REMOTE);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE);
// NOTE(review): these two constants look like database file names rather
// than profile names — presumably defensive; confirm against CrawlSwitchboard
ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES);
}
/**
 * Writes this crawl profile as one row of a crawl-profile table into the
 * given server objects, using {@code CRAWL_PROFILE_PREFIX + count + "_"}
 * as the key prefix so different servlets can render into different
 * template sections.
 *
 * @param CRAWL_PROFILE_PREFIX template key prefix chosen by the caller
 * @param prop                 target server objects to fill
 * @param crawlStacker         crawl stacker (currently unused, kept for interface stability)
 * @param active               true if this profile is in the active crawl set
 * @param dark                 alternating row shading flag for the template
 * @param count                row index, used as the template key infix
 * @param domlistlength        maximum number of domain filter entries to list
 */
public void putProfileEntry(
final String CRAWL_PROFILE_PREFIX,
final serverObjects prop,
final CrawlStacker crawlStacker,
final boolean active,
final boolean dark,
final int count,
final int domlistlength) {
final String prefix = CRAWL_PROFILE_PREFIX + count + "_";
prop.put(prefix + "dark", dark ? "1" : "0");
prop.put(prefix + "name", this.name());
// system profiles and passive profiles cannot be terminated
final boolean terminable = active && !ignoreNames.contains(this.name());
prop.put(prefix + "terminateButton", terminable ? "1" : "0");
prop.put(prefix + "terminateButton_handle", this.handle());
// only passive (finished/terminated) profiles may be deleted
prop.put(prefix + "deleteButton", active ? "0" : "1");
prop.put(prefix + "deleteButton_handle", this.handle());
prop.putXML(prefix + "startURL", this.startURL());
prop.put(prefix + "handle", this.handle());
prop.put(prefix + "depth", this.depth());
prop.put(prefix + "mustmatch", this.urlMustMatchPattern().toString());
prop.put(prefix + "mustnotmatch", this.urlMustNotMatchPattern().toString());
// a recrawl age of 0 means re-crawling is disabled for this profile
prop.put(prefix + "crawlingIfOlder", this.recrawlIfOlder() == 0L
? "no re-crawl"
: DateFormat.getDateTimeInstance().format(this.recrawlIfOlder()));
prop.put(prefix + "crawlingDomFilterDepth", "inactive");
// enumerate the domain filter entries; the last listed entry is
// suffixed with " ..." to indicate truncation at domlistlength
int entryCount = 0;
if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) {
String domain;
while (entryCount <= domlistlength && !(domain = this.domName(true, entryCount)).isEmpty()) {
if (entryCount == domlistlength) {
domain += " ...";
}
prop.putHTML(prefix + "crawlingDomFilterContent_" + entryCount + "_item", domain);
entryCount++;
}
}
prop.put(prefix + "crawlingDomFilterContent", entryCount);
prop.put(prefix + "crawlingDomMaxPages", this.domMaxPages() == Integer.MAX_VALUE
? "unlimited" : Integer.toString(this.domMaxPages()));
prop.put(prefix + "withQuery", this.crawlingQ() ? "1" : "0");
prop.put(prefix + "storeCache", this.storeHTCache() ? "1" : "0");
prop.put(prefix + "indexText", this.indexText() ? "1" : "0");
prop.put(prefix + "indexMedia", this.indexMedia() ? "1" : "0");
prop.put(prefix + "remoteIndexing", this.remoteIndexing() ? "1" : "0");
}
}

Loading…
Cancel
Save