- fixed display of last entered value at IndexCreate_p plus minor usability/HTML adjustments

- removed double XML-escaping from CacheAdmin_p

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3588 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent 62b79aa0a9
commit c5c3ecc67e

@ -149,7 +149,7 @@ public class CacheAdmin_p {
prop.put("info_type_headlines", t.length);
for (i = 0; i < t.length; i++)
prop.put("info_type_headlines_" + i + "_headline",
de.anomic.data.wikiCode.replaceXMLEntities(t[i].replaceAll("\n", "").trim()));
t[i].replaceAll("\n", "").trim());
formatAnchor(prop, document.getHyperlinks(), "links");
formatImageAnchor(prop, document.getImages());
@ -166,7 +166,7 @@ public class CacheAdmin_p {
if (sentences != null)
while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line",
de.anomic.data.wikiCode.replaceXMLEntities((new String((StringBuffer) sentences.next())).replaceAll("\n", "").trim()));
new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
i++;
}
prop.put("info_type_lines", i);

@ -28,16 +28,16 @@
<td>
<table cellpadding="0" cellspacing="0">
<tr>
<td>From&nbsp;URL:</td>
<td><input type="radio" name="crawlingMode" value="url" checked="checked" /></td>
<td><label for="url"><nobr>From URL</nobr></label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<input name="crawlingURL" type="text" size="41" maxlength="256" value="http://" onkeypress="changed()" />
<span id="robotsOK"></span>
</td>
</tr>
<tr>
<td>From&nbsp;File:</td>
<td><input type="radio" name="crawlingMode" value="file" /></td>
<td><label for="file"><nobr>From File</nobr></label>:</td>
<td><input type="radio" name="crawlingMode" id="file" value="file" /></td>
<td><input type="file" name="crawlingFile" size="28" /></td>
</tr>
<tr>
@ -52,8 +52,8 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Crawling Depth:</td>
<td><input name="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" /></td>
<td><label for="crawlingDepth">Crawling Depth</label>:</td>
<td><input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" /></td>
<td>
This defines how often the Crawler will follow links embedded in websites.<br />
A minimum of 0 is recommended and means that the page you enter under "Starting Point" will be added
@ -63,8 +63,8 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Crawling Filter:</td>
<td><input name="crawlingFilter" type="text" size="20" maxlength="100" value="#[crawlingFilter]#" /></td>
<td><label for="crawlingFilter">Crawling Filter</label>:</td>
<td><input name="crawlingFilter" id="crawlingFilter" type="text" size="20" maxlength="100" value="#[crawlingFilter]#" /></td>
<td>
This is an emacs-like regular expression that must match the URLs which are to be crawled.
Use this e.g. to crawl a single domain. If you set this filter it makes sense to increase
@ -74,16 +74,16 @@
<tr valign="top" class="TableCellLight">
<td>Re-Crawl Option:</td>
<td>
Use:<input type="checkbox" name="crawlingIfOlderCheck" #(crawlingIfOlderCheck)#::checked="checked"#(/crawlingIfOlderCheck)# />&nbsp;&nbsp;
Interval:
<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#" />
<select>
<option name="crawlingIfOlderUnit" value="year" #(crawlingIfOlderUnitYearCheck)#::selected="selected"#(/crawlingIfOlderUnitYearCheck)# />Year(s)&nbsp;&nbsp;
<option name="crawlingIfOlderUnit" value="month" #(crawlingIfOlderUnitMonthCheck)#::selected="selected"#(/crawlingIfOlderUnitMonthCheck)# />Month(s)&nbsp;&nbsp;
<option name="crawlingIfOlderUnit" value="day" #(crawlingIfOlderUnitDayCheck)#::selected="selected"#(/crawlingIfOlderUnitDayCheck)# />Day(s)&nbsp;&nbsp;
<option name="crawlingIfOlderUnit" value="hour" #(crawlingIfOlderUnitHourCheck)#::selected="selected"#(/crawlingIfOlderUnitHourCheck)# />Hour(s)&nbsp;&nbsp;
<option name="crawlingIfOlderUnit" value="minute" #(crawlingIfOlderUnitMinuteCheck)#::selected="selected"#(/crawlingIfOlderUnitMinuteCheck)# />Minute(s)
<label for="crawlingIfOlderChecked">Use</label>:
<input type="checkbox" name="crawlingIfOlderCheck" id="crawlingIfOlderChecked" #(crawlingIfOlderCheck)#::checked="checked"#(/crawlingIfOlderCheck)# />&nbsp;&nbsp;
<label for="crawlingIfOlderNumber">Interval</label>:
<input name="crawlingIfOlderNumber" id="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#" />
<select name="crawlingIfOlderUnit">
<option value="year" #(crawlingIfOlderUnitYearCheck)#::selected="selected"#(/crawlingIfOlderUnitYearCheck)#>Year(s)</option>
<option value="month" #(crawlingIfOlderUnitMonthCheck)#::selected="selected"#(/crawlingIfOlderUnitMonthCheck)#>Month(s)</option>
<option value="day" #(crawlingIfOlderUnitDayCheck)#::selected="selected"#(/crawlingIfOlderUnitDayCheck)#>Day(s)</option>
<option value="hour" #(crawlingIfOlderUnitHourCheck)#::selected="selected"#(/crawlingIfOlderUnitHourCheck)#>Hour(s)</option>
<option value="minute" #(crawlingIfOlderUnitMinuteCheck)#::selected="selected"#(/crawlingIfOlderUnitMinuteCheck)#>Minute(s)</option>
</select>
</td>
<td>
@ -95,8 +95,11 @@
<tr valign="top" class="TableCellDark">
<td>Auto-Dom-Filter:</td>
<td>
Use:<input type="checkbox" name="crawlingDomFilterCheck" #(crawlingDomFilterCheck)#::checked="checked"#(/crawlingDomFilterCheck)# />&nbsp;&nbsp;
Depth:<input name="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#" /></td>
<label for="crawlingDomFilterCheck">Use</label>:
<input type="checkbox" name="crawlingDomFilterCheck" id="crawlingDomFilterCheck" #(crawlingDomFilterCheck)#::checked="checked"#(/crawlingDomFilterCheck)# />&nbsp;&nbsp;
<label for="crawlingDomFilterDepth">Depth</label>:
<input name="crawlingDomFilterDepth" id="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#" />
</td>
<td>
This option will automatically create a domain-filter which limits the crawl on domains the crawler
will find on the given depth. You can use this option e.g. to crawl a page with bookmarks while
@ -108,8 +111,11 @@
<tr valign="top" class="TableCellLight">
<td>Maximum Pages per Domain:</td>
<td>
Use:<input type="checkbox" name="crawlingDomMaxCheck" #(crawlingDomMaxCheck)#::checked="checked"#(/crawlingDomMaxCheck)# />&nbsp;&nbsp;
Page-Count:<input name="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" /></td>
<label for="crawlingDomMaxCheck">Use</label>:
<input type="checkbox" name="crawlingDomMaxCheck" id="crawlingDomMaxCheck" #(crawlingDomMaxCheck)#::checked="checked"#(/crawlingDomMaxCheck)# />&nbsp;&nbsp;
<label for="crawlingDomMaxPages">Page-Count</label>:
<input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
</td>
<td>
You can limit the maximum number of pages that are fetched and indexed from a single domain with this option.
You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within
@ -117,16 +123,16 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Accept URLs with '?' / dynamic URLs:</td>
<td><input type="checkbox" name="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /></td>
<td><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label>:</td>
<td><input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /></td>
<td>
A question mark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled. However, there are sometimes web pages with static content that
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Store to Web Cache:</td>
<td><input type="checkbox" name="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
<td><label for="storeHTCache">Store to Web Cache</label>:</td>
<td><input type="checkbox" name="storeHTCache" id="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
<td>
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
We recommend to leave this switched off unless you want to control the crawl results with the
@ -135,24 +141,28 @@
</tr>
<tr valign="top" class="TableCellDark">
<td>Do Local Indexing:</td>
<td>index text:<input type="checkbox" name="indexText" #(indexingTextChecked)#::checked="checked"#(/indexingTextChecked)# />&nbsp;&nbsp;&nbsp;
index media:<input type="checkbox" name="indexMedia" #(indexingMediaChecked)#::checked="checked"#(/indexingMediaChecked)# /></td>
<td>
<label for="indexText">index text</label>:
<input type="checkbox" name="indexText" id="indexText" #(indexingTextChecked)#::checked="checked"#(/indexingTextChecked)# />&nbsp;&nbsp;&nbsp;
<label for="indexMedia">index media</label>:
<input type="checkbox" name="indexMedia" id="indexMedia" #(indexingMediaChecked)#::checked="checked"#(/indexingMediaChecked)# />
</td>
<td>
This enables indexing of the webpages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
<a href="CacheAdmin_p.html">Proxy Cache</a> without indexing.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Do Remote Indexing:</td>
<td><label for="crawlOrder">Do Remote Indexing</label>:</td>
<td>
<table border="0" cellpadding="2" cellspacing="0">
<tr>
<td>
<input type="checkbox" name="crawlOrder" #(crawlOrderChecked)#::checked="checked"#(/crawlOrderChecked)# />
<input type="checkbox" name="crawlOrder" id="crawlOrder" #(crawlOrderChecked)#::checked="checked"#(/crawlOrderChecked)# />
</td>
<td>
Describe your intention to start this global crawl (optional):<br />
<input name="intention" type="text" size="40" maxlength="100" value="" /><br />
<label for="intention">Describe your intention to start this global crawl (optional)</label>:<br />
<input name="intention" id="intention" type="text" size="40" maxlength="100" value="" /><br />
This message will appear in the 'Other Peer Crawl Start' table of other peers.
</td>
</tr>
@ -162,12 +172,13 @@
If checked, the crawler will contact other peers and use them as remote indexers for your crawl.
If you need your crawling results locally, you should switch this off.
Only senior and principal peers can initiate or receive remote crawls.
<strong>A YaCyNews message will be created to inform all peers about a global crawl</strong>, so they can omit starting a crawl with the same start point.
<strong>A YaCyNews message will be created to inform all peers about a global crawl</strong>,
so they can omit starting a crawl with the same start point.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Exclude <em>static</em> Stop-Words</td>
<td><input type="checkbox" name="xsstopw" #(xsstopwChecked)#::checked="checked"#(/xsstopwChecked)# /></td>
<td><label for="xsstopw">Exclude <em>static</em> Stop-Words</label>:</td>
<td><input type="checkbox" name="xsstopw" id="xsstopw" #(xsstopwChecked)#::checked="checked"#(/xsstopwChecked)# /></td>
<td>
This can be useful to prevent extremely common words from being added to the database, e.g. "the", "he", "she", "it"... To exclude all words given in the file <tt>yacy.stopwords</tt> from indexing,
check this box.
@ -194,9 +205,9 @@
<tr valign="top" class="TableCellLight">
<td>Wanted Performance:</td>
<td>
<input type="radio" name="crawlingPerformance" value="maximum" #(crawlingSpeedMaxChecked)#::checked="checked"#(/crawlingSpeedMaxChecked)# />maximum&nbsp;&nbsp;
<input type="radio" name="crawlingPerformance" value="custom" #(crawlingSpeedCustChecked)#::checked="checked"#(/crawlingSpeedCustChecked)# />custom: <input name="customPPM" type="text" size="4" maxlength="4" value="#[customPPMdefault]#" />PPM&nbsp;&nbsp;
<input type="radio" name="crawlingPerformance" value="minimum" #(crawlingSpeedMinChecked)#::checked="checked"#(/crawlingSpeedMinChecked)# />optimal as background process
<input type="radio" name="crawlingPerformance" id="maximum" value="maximum" #(crawlingSpeedMaxChecked)#::checked="checked"#(/crawlingSpeedMaxChecked)# /><label for="maximum">maximum</label><br />
<input type="radio" name="crawlingPerformance" id="custom" value="custom" #(crawlingSpeedCustChecked)#::checked="checked"#(/crawlingSpeedCustChecked)# /><label for="custom">custom</label>: <input name="customPPM" type="text" size="4" maxlength="4" value="#[customPPMdefault]#" />PPM<br />
<input type="radio" name="crawlingPerformance" id="minimum" value="minimum" #(crawlingSpeedMinChecked)#::checked="checked"#(/crawlingSpeedMinChecked)# /><label for="minimum"><nobr>optimal as background process</nobr></label>
</td>
<td colspan="3">
Set wanted level of computing power, used for this and other running crawl tasks. (PPM = pages per minute)
@ -222,27 +233,26 @@
</colgroup>
<tr valign="top" class="TableCellDark">
<td>
<input type="radio" name="dcr" value="acceptCrawlMax" #(acceptCrawlMaxChecked)#::checked="checked"#(/acceptCrawlMaxChecked)# />
</td>
<td>
Accept remote crawling requests and perform crawl at maximum load
<input type="radio" name="dcr" id="acceptCrawlMax" value="acceptCrawlMax" #(acceptCrawlMaxChecked)#::checked="checked"#(/acceptCrawlMaxChecked)# />
</td>
<td><label for="acceptCrawlMax">Accept remote crawling requests and perform crawl at maximum load</label></td>
</tr>
<tr valign="top" class="TableCelllight">
<td>
<input type="radio" name="dcr" value="acceptCrawlLimited" #(acceptCrawlLimitedChecked)#::checked="checked"#(/acceptCrawlLimitedChecked)# />
<input type="radio" name="dcr" id="acceptCrawlLimited" value="acceptCrawlLimited" #(acceptCrawlLimitedChecked)#::checked="checked"#(/acceptCrawlLimitedChecked)# />
</td>
<td>
Accept remote crawling requests and perform crawl at maximum of
<label for="acceptCrawlLimited">Accept remote crawling requests and perform crawl at maximum of</label>
<input name="acceptCrawlLimit" type="text" size="4" maxlength="4" value="#[PPM]#" /> Pages Per Minute (minimum is 1, low system load usually at PPM &ge; 30)
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>
<input type="radio" name="dcr" value="acceptCrawlDenied" #(acceptCrawlDeniedChecked)#::checked="checked"#(/acceptCrawlDeniedChecked)# />
<input type="radio" name="dcr" id="acceptCrawlDenied" value="acceptCrawlDenied" #(acceptCrawlDeniedChecked)#::checked="checked"#(/acceptCrawlDeniedChecked)# />
</td>
<td>
Do not accept remote crawling requests (please set this only if you cannot accept to crawl only one page per minute; see option above)
<label for="acceptCrawlDenied">Do not accept remote crawling requests (please set this only if
you cannot accept to crawl only one page per minute; see option above)</label>
</td>
</tr>
<tr valign="top" class="TableCellLight">

@ -115,19 +115,19 @@ public class IndexCreate_p {
prop.put("crawlingIfOlderUnitHourCheck", 0);
prop.put("crawlingIfOlderUnitMinuteCheck", 0);
if ((crawlingIfOlder == -1) || (crawlingIfOlder == Integer.MAX_VALUE)) {
prop.put("crawlingIfOlderNumber", 1);
prop.put("crawlingIfOlderNumber", -1);
prop.put("crawlingIfOlderUnitYearCheck", 1);
} else if (crawlingIfOlder >= 60*24*365) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / (60*24*365));
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*365)));
prop.put("crawlingIfOlderUnitYearCheck", 1);
} else if (crawlingIfOlder >= 60*24*30) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / (60*24*30));
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*30)));
prop.put("crawlingIfOlderUnitMonthCheck", 1);
} else if (crawlingIfOlder >= 60*24) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / (60*24));
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24)));
prop.put("crawlingIfOlderUnitDayCheck", 1);
} else if (crawlingIfOlder >= 60) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / 60);
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / 60f));
prop.put("crawlingIfOlderUnitHourCheck", 1);
} else {
prop.put("crawlingIfOlderNumber", crawlingIfOlder);
@ -146,7 +146,7 @@ public class IndexCreate_p {
prop.put("crawlOrderChecked", env.getConfig("crawlOrder", "").equals("true") ? 1 : 0);
long LCbusySleep = Integer.parseInt(env.getConfig(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100"));
int LCppm = (int) (60000L / LCbusySleep);
int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? 1 : 0);
prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? 1 : 0);
prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? 1 : 0);
@ -172,7 +172,7 @@ public class IndexCreate_p {
prop.put("acceptCrawlLimitedChecked", 0);
prop.put("acceptCrawlDeniedChecked", 1);
}
int RTCppm = (int) (60000L / RTCbusySleep);
int RTCppm = (RTCbusySleep == 0) ? 60 : (int) (60000L / RTCbusySleep);
if (RTCppm > 60) RTCppm = 60;
prop.put("PPM", RTCppm);

@ -47,7 +47,6 @@ import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
@ -187,8 +186,8 @@ public class IndexMonitor {
} else {
prop.put("table_indexed_" + cnt + "_available", 1);
prop.put("table_indexed_" + cnt + "_available_cachepath", cachepath);
prop.put("table_indexed_" + cnt + "_available_urltitle", wikiCode.replaceXMLEntities(urlstr));
prop.put("table_indexed_" + cnt + "_available_url", wikiCode.replaceXMLEntities(urltxt));
prop.put("table_indexed_" + cnt + "_available_urltitle", urlstr);
prop.put("table_indexed_" + cnt + "_available_url", urltxt);
}
dark = !dark;
cnt++;

@ -389,7 +389,7 @@ public class WatchCrawler_p {
private static int recrawlIfOlderC(boolean recrawlIfOlderCheck, int recrawlIfOlderNumber, String crawlingIfOlderUnit) {
if (!recrawlIfOlderCheck) return -1;
if (crawlingIfOlderUnit.equals("year")) return recrawlIfOlderNumber * 60 * 24 * 356;
if (crawlingIfOlderUnit.equals("year")) return recrawlIfOlderNumber * 60 * 24 * 365;
if (crawlingIfOlderUnit.equals("month")) return recrawlIfOlderNumber * 60 * 24 * 30;
if (crawlingIfOlderUnit.equals("day")) return recrawlIfOlderNumber * 60 * 24;
if (crawlingIfOlderUnit.equals("hour")) return recrawlIfOlderNumber * 60;

Loading…
Cancel
Save