made CrawlStartExpert_p aware of POST variables; extended the template where needed

pull/1/head
bhoerdzn 11 years ago
parent 3bf0104199
commit 42ea56eaad
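Note on the template syntax used throughout this diff: #[key]# is a plain value substitution, while #(key)#zero-branch::one-branch#(/key)# expands to the left branch when the servlet puts "0" for key and to the right branch when it puts "1". Below is a minimal, self-contained sketch of that convention; it only models the markers and is not YaCy's actual template engine (the class, regexes and example keys are invented for illustration):

    import java.util.HashMap;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // Illustration only: mimics the #(key)#zero::one#(/key)# and #[key]# markers
    // used in CrawlStartExpert_p.html. This is NOT YaCy's real template engine.
    public class TemplateMarkerSketch {

        // #(key)#zero-branch::one-branch#(/key)#  (the ::one-branch part is optional)
        private static final Pattern COND =
                Pattern.compile("#\\((\\w+)\\)#(.*?)(?:::(.*?))?#\\(/\\1\\)#", Pattern.DOTALL);
        // #[key]#
        private static final Pattern VALUE = Pattern.compile("#\\[(\\w+)\\]#");

        static String expand(final String template, final Map<String, String> prop) {
            final Matcher m = COND.matcher(template);
            final StringBuffer conditionals = new StringBuffer();
            while (m.find()) {
                final boolean one = "1".equals(prop.get(m.group(1)));
                final String branch = one ? (m.group(3) == null ? "" : m.group(3)) : m.group(2);
                m.appendReplacement(conditionals, Matcher.quoteReplacement(branch));
            }
            m.appendTail(conditionals);
            final Matcher v = VALUE.matcher(conditionals.toString());
            final StringBuffer result = new StringBuffer();
            while (v.find()) {
                v.appendReplacement(result, Matcher.quoteReplacement(prop.getOrDefault(v.group(1), "")));
            }
            v.appendTail(result);
            return result.toString();
        }

        public static void main(String[] args) {
            final Map<String, String> prop = new HashMap<>();
            prop.put("crawlingMode_url", "1");          // servlet decided: URL mode stays selected
            prop.put("starturl", "http://example.org"); // posted value echoed back by the servlet
            System.out.println(expand(
                    "<input type=\"radio\" name=\"crawlingMode\" value=\"url\" "
                  + "#(crawlingMode_url)#::checked=\"checked\"#(/crawlingMode_url)# />"
                  + "#[starturl]#", prop));
            // prints: <input type="radio" name="crawlingMode" value="url" checked="checked" />http://example.org
        }
    }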

@ -50,7 +50,7 @@
Each of these URLs is the root for a crawl start; existing start URLs are always re-loaded.
Other already visited URLs are sorted out as "double" unless re-loading them is allowed by the re-crawl option.
</span></span>
<input type="radio" align="top" name="crawlingMode" id="url" value="url" checked="checked" />
<input type="radio" align="top" name="crawlingMode" id="url" value="url" #(crawlingMode_url)#::checked="checked"#(/crawlingMode_url)# />
<textarea name="crawlingURL" id="crawlingURL" cols="64" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
&nbsp;
<span id="robotsOK"></span>
@ -59,20 +59,20 @@
</dd>
<dt></dt>
<dd>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="#[bookmarkTitle]#" readonly="readonly" style="background:transparent; border:0px"/>
</dd>
<dt>From Link-List of URL</dt>
<dd>
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><br />
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)# onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><br />
<div id="sitelistURLs"></div>
</dd>
<dt>From Sitemap</dt>
<dd>
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/><input name="sitemapURL" type="text" size="71" maxlength="256" value="" readonly="readonly"/>
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" #(crawlingMode_sitemap)#::checked="checked"#(/crawlingMode_sitemap)# #(has_sitemapURL)#disabled="disabled"::#(/has_sitemapURL)#/><input name="sitemapURL" type="text" size="71" maxlength="256" value="#[sitemapURL]#" readonly="readonly"/>
</dd>
<dt>From File (enter a path<br/>within your local file system)</dt>
<dd>
<input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><input type="text" name="crawlingFile" size="71" maxlength="256" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>-->
<input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;" #(crawlingMode_file)#::checked="checked"#(/crawlingMode_file)#/><input type="text" name="crawlingFile" value="#[crawlingFile]#" size="71" maxlength="256" onfocus="check('file')"/>
</dd>
</dl>
</fieldset>
@ -129,10 +129,10 @@
</span></span>
<table border="0">
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td></td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)</td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)</td></tr>
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false"/></td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" #(range_domain)#::checked="checked"#(/range_domain)# onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)</td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" #(range_subpath)#::checked="checked"#(/range_subpath)# onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)</td></tr>
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" #(range_wide)#::checked="checked"#(/range_wide)# onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false" #(range_wide)#disabled="disabled"::#(/range_wide)#/></td></tr>
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
</table>
</dd>
@ -149,8 +149,8 @@
Crawls can be restricted to specific countries. This uses the country code that can be computed from
the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by commas.
</span></span>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction<br />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" #(countryMustMatchSwitchChecked)#::checked="checked"#(/countryMustMatchSwitchChecked)# />no country code restriction<br />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" #(countryMustMatchSwitchChecked)#checked="checked::"#(/countryMustMatchSwitchChecked)#/>Use filter&nbsp;&nbsp;
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" />
</dd>
</dl>
@ -187,24 +187,33 @@
After a crawl was done in the past, documents may become stale and eventually be deleted on the target host.
To remove old files from the search index it is not sufficient to just consider them for re-load; it may be necessary
to delete them because they simply do not exist any more. Use this in combination with re-crawl; the re-crawl time should be longer.
</span></span><input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/>Do not delete any document before the crawl is started.</dd>
</span></span><input type="radio" name="deleteold" id="deleteoldoff" value="off" #(deleteold_off)#::checked="checked"#(/deleteold_off)#/>Do not delete any document before the crawl is started.</dd>
<dt>Delete sub-path</dt>
<dd><input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
<dd><input type="radio" name="deleteold" id="deleteoldon" value="on" #(deleteold_on)#::checked="checked"#(/deleteold_on)# #(range_wide)#::disabled="disabled"#(/range_wide)#/>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
<dt>Delete only old</dt>
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/>Treat documents that are loaded
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" #(deleteold_age)#::checked="checked"#(/deleteold_age)# #(range_wide)#::disabled="disabled"#(/range_wide)#/>Treat documents that are loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14" selected="selected">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
<option value="1" #(deleteIfOlderNumber_1)#::selected="selected"#(/deleteIfOlderNumber_1)#>1</option>
<option value="2" #(deleteIfOlderNumber_2)#::selected="selected"#(/deleteIfOlderNumber_2)#>2</option>
<option value="3" #(deleteIfOlderNumber_3)#::selected="selected"#(/deleteIfOlderNumber_3)#>3</option>
<option value="4" #(deleteIfOlderNumber_4)#::selected="selected"#(/deleteIfOlderNumber_4)#>4</option>
<option value="5" #(deleteIfOlderNumber_5)#::selected="selected"#(/deleteIfOlderNumber_5)#>5</option>
<option value="6" #(deleteIfOlderNumber_6)#::selected="selected"#(/deleteIfOlderNumber_6)#>6</option>
<option value="7" #(deleteIfOlderNumber_7)#::selected="selected"#(/deleteIfOlderNumber_7)#>7</option>
<option value="8" #(deleteIfOlderNumber_8)#::selected="selected"#(/deleteIfOlderNumber_8)#>8</option>
<option value="9" #(deleteIfOlderNumber_9)#::selected="selected"#(/deleteIfOlderNumber_9)#>9</option>
<option value="10" #(deleteIfOlderNumber_10)#::selected="selected"#(/deleteIfOlderNumber_10)#>10</option>
<option value="12" #(deleteIfOlderNumber_12)#::selected="selected"#(/deleteIfOlderNumber_12)#>12</option>
<option value="14" #(deleteIfOlderNumber_14)#::selected="selected"#(/deleteIfOlderNumber_14)#>14</option>
<option value="21" #(deleteIfOlderNumber_21)#::selected="selected"#(/deleteIfOlderNumber_21)#>21</option>
<option value="28" #(deleteIfOlderNumber_28)#::selected="selected"#(/deleteIfOlderNumber_28)#>28</option>
<option value="30" #(deleteIfOlderNumber_30)#::selected="selected"#(/deleteIfOlderNumber_30)#>30</option>
</select>
<select name="deleteIfOlderUnit" id="deleteIfOlderUnit">
<option value="year">years</option>
<option value="month">months</option>
<option value="day" selected="selected">days</option>
<option value="hour">hours</option>
<select name="deleteIfOlderUnit" id="deleteIfOlderUnit">
<option value="year" #(deleteIfOlderUnit_year)#::selected="selected"#(/deleteIfOlderUnit_year)#>years</option>
<option value="month" #(deleteIfOlderUnit_month)#::selected="selected"#(/deleteIfOlderUnit_month)#>months</option>
<option value="day" #(deleteIfOlderUnit_day)#::selected="selected"#(/deleteIfOlderUnit_day)#>days</option>
<option value="hour" #(deleteIfOlderUnit_hour)#::selected="selected"#(/deleteIfOlderUnit_hour)#>hours</option>
</select> ago as stale and delete them before the crawl is started.
</dd>
</dl>
@ -217,22 +226,31 @@
A web crawl performs a double-check on all links found in the internet against the internal database. If the same URL is found again,
it is treated as a double when you check the 'no doubles' option. A URL may be loaded again when it has reached a specific age;
to use that, check the 're-load' option.
</span></span><input type="radio" name="recrawl" value="nodoubles" checked="checked"/>Never load any page that is already known. Only the start-url may be loaded again.</dd>
</span></span><input type="radio" name="recrawl" value="nodoubles" #(recrawl_nodoubles)#checked="checked"#(/recrawl_nodoubles)#/>Never load any page that is already known. Only the start-url may be loaded again.</dd>
<dt>Re-load</dt>
<dd><input type="radio" name="recrawl" value="reload"/>Treat documents that are loaded
<dd><input type="radio" name="recrawl" value="reload" #(recrawl_reload)#checked="checked"#(/recrawl_reload)#/>Treat documents that are loaded
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
<option value="1" #(reloadIfOlderNumber_1)#::selected="selected"#(/reloadIfOlderNumber_1)#>1</option>
<option value="2" #(reloadIfOlderNumber_2)#::selected="selected"#(/reloadIfOlderNumber_2)#>2</option>
<option value="3" #(reloadIfOlderNumber_3)#::selected="selected"#(/reloadIfOlderNumber_3)#>3</option>
<option value="4" #(reloadIfOlderNumber_4)#::selected="selected"#(/reloadIfOlderNumber_4)#>4</option>
<option value="5" #(reloadIfOlderNumber_5)#::selected="selected"#(/reloadIfOlderNumber_5)#>5</option>
<option value="6" #(reloadIfOlderNumber_6)#::selected="selected"#(/reloadIfOlderNumber_6)#>6</option>
<option value="7" #(reloadIfOlderNumber_7)#::selected="selected"#(/reloadIfOlderNumber_7)#>7</option>
<option value="8" #(reloadIfOlderNumber_8)#::selected="selected"#(/reloadIfOlderNumber_8)#>8</option>
<option value="9" #(reloadIfOlderNumber_9)#::selected="selected"#(/reloadIfOlderNumber_9)#>9</option>
<option value="10" #(reloadIfOlderNumber_10)#::selected="selected"#(/reloadIfOlderNumber_10)#>10</option>
<option value="12" #(reloadIfOlderNumber_12)#::selected="selected"#(/reloadIfOlderNumber_12)#>12</option>
<option value="14" #(reloadIfOlderNumber_14)#::selected="selected"#(/reloadIfOlderNumber_14)#>14</option>
<option value="21" #(reloadIfOlderNumber_21)#::selected="selected"#(/reloadIfOlderNumber_21)#>21</option>
<option value="28" #(reloadIfOlderNumber_28)#::selected="selected"#(/reloadIfOlderNumber_28)#>28</option>
<option value="30" #(reloadIfOlderNumber_30)#::selected="selected"#(/reloadIfOlderNumber_30)#>30</option>
</select>
<select name="reloadIfOlderUnit" id="reloadIfOlderUnit">
<option value="year">years</option>
<option value="month">months</option>
<option value="day" selected="selected">days</option>
<option value="hour">hours</option>
<select name="reloadIfOlderUnit" id="reloadIfOlderUnit">
<option value="year" #(reloadIfOlderUnit_year)#::selected="selected"#(/reloadIfOlderUnit_year)#>years</option>
<option value="month" #(reloadIfOlderUnit_month)#::selected="selected"#(/reloadIfOlderUnit_month)#>months</option>
<option value="day" #(reloadIfOlderUnit_day)#::selected="selected"#(/reloadIfOlderUnit_day)#>days</option>
<option value="hour" #(reloadIfOlderUnit_hour)#::selected="selected"#(/reloadIfOlderUnit_hour)#>hours</option>
</select> ago as stale and load them again. If they are younger, they are ignored.
</dd>
</dl>
@ -256,10 +274,10 @@
<b>if&nbsp;exist</b>: use the cache if the cache exists. Do not check freshness. Otherwise use the online source;
<b>cache&nbsp;only</b>: never go online, use all content from the cache. If no cache exists, treat content as unavailable
</span></span>
<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" />cache&nbsp;only
<input type="radio" name="cachePolicy" value="nocache" #(cachePolicy_nocache)#::checked="checked"#(/cachePolicy_nocache)#/>no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" #(cachePolicy_iffresh)#::checked="checked"#(/cachePolicy_iffresh)# />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" #(cachePolicy_ifexist)#::checked="checked"#(/cachePolicy_ifexist)#/>if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" #(cachePolicy_cacheonly)#::checked="checked"#(/cachePolicy_cacheonly)#/>cache&nbsp;only
</dd>
</dl>
</fieldset>
@ -290,7 +308,7 @@
<dt>Do Local Indexing</dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
This enables indexing of the wepages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
This enables indexing of the webpages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
Document Cache without indexing.
</span></span>
<label for="indexText">index text</label>:
@ -315,7 +333,7 @@
</td>
<td>
<label for="intention">Describe your intention to start this global crawl (optional)</label>:<br />
<input name="intention" id="intention" type="text" size="40" maxlength="100" value="" /><br />
<input name="intention" id="intention" type="text" size="40" maxlength="100" value="#[intention]#" /><br />
This message will appear in the 'Other Peer Crawl Start' table of other peers.
</td>
</tr>
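The re-load and delete-if-older controls above pair a number with a unit ("treat documents that are loaded N units ago as stale"). As a hedged illustration of what such a setting amounts to, independent of how YaCy itself evaluates it, the pair can be turned into a cutoff instant as sketched here (the class and method are invented for the sketch; years and months are approximated in days):

    import java.time.Instant;
    import java.time.temporal.ChronoUnit;

    // Illustration only: converts a "number + unit" age setting into a cutoff
    // instant; documents fetched before the cutoff would count as stale.
    public class StaleCutoffSketch {

        static Instant cutoff(final int number, final String unit) {
            final Instant now = Instant.now();
            switch (unit) {
                case "year":  return now.minus(365L * number, ChronoUnit.DAYS); // approximation
                case "month": return now.minus(30L * number, ChronoUnit.DAYS);  // approximation
                case "day":   return now.minus(number, ChronoUnit.DAYS);
                case "hour":  return now.minus(number, ChronoUnit.HOURS);
                default: throw new IllegalArgumentException("unknown unit: " + unit);
            }
        }

        public static void main(String[] args) {
            // template defaults: re-load if older than 7 days, delete if older than 14 days
            System.out.println("re-load cutoff: " + cutoff(7, "day"));
            System.out.println("delete cutoff:  " + cutoff(14, "day"));
        }
    }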

@ -43,35 +43,434 @@ public class CrawlStartExpert_p {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
// define visible variables
prop.put("starturl", /*(intranet) ? repository :*/ "");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
// ---------- Start point
// crawl start URL
if (post != null && post.containsKey("crawlingURL")) {
prop.put("starturl", post.get("crawlingURL"));
// simple check for content since it may be empty
if (!post.get("crawlingURL").trim().isEmpty()) {
prop.put("has_url", "1");
}
} else {
prop.put("starturl", "");
}
// sitemap URL
if (post != null && post.containsKey("sitemapURL")) {
prop.put("sitemapURL", post.get("sitemapURL"));
// simple check for content since it may be empty
if (!post.get("sitemapURL").trim().isEmpty()) {
prop.put("has_sitemapURL", "1");
}
} else {
prop.put("sitemapURL", "");
}
// crawling file
if (post != null && post.containsKey("crawlingFile")) {
prop.put("crawlingFile", post.get("crawlingFile"));
// simple check for content since it may be empty
if (!post.get("crawlingFile").trim().isEmpty()) {
prop.put("has_crawlingFile", "1");
}
} else {
prop.put("crawlingFile", "");
}
// Crawling mode
if (post != null && post.containsKey("crawlingMode")) {
final String crawlingMode = post.get("crawlingMode");
boolean hasMode = false;
if (crawlingMode.equalsIgnoreCase("sitelist")
&& prop.getBoolean("has_url")) {
// sitelist needs "crawlingURL" parameter, checked already
prop.put("crawlingMode_sitelist", "1");
hasMode = true;
} else if (crawlingMode.equalsIgnoreCase("sitemap")
&& prop.getBoolean("has_sitemapURL")) {
// sitemap needs "sitemapURL" parameter, checked already
prop.put("crawlingMode_sitemap", "1");
hasMode = true;
} else if (crawlingMode.equalsIgnoreCase("file")
&& prop.getBoolean("has_crawlingFile")) {
// sitemap needs "crawlingFile" parameter, checked already
prop.put("crawlingMode_file", "1");
hasMode = true;
}
// default to URL mode
if (!hasMode) {
prop.put("crawlingMode_url", "1");
}
} else {
// default to URL
prop.put("crawlingMode_url", "1");
}
// Bookmark title (set by script)
if (post != null && post.containsKey("bookmarkTitle")) {
prop.put("bookmarkTitle", post.get("bookmarkTitle"));
} else {
prop.put("bookmarkTitle", "");
}
// ---------- Crawling filter
final int crawlingDomMaxPages = env.getConfigInt(
"crawlingDomMaxPages", -1);
// crawling depth
if (post != null && post.containsKey("crawlingDepth")) {
final Integer depth = post.getInt("crawlingDepth", -1);
// depth is limited to two digits, zero allowed
if (depth >= 0 && depth < 100) {
prop.put("crawlingDepth", depth);
}
}
if (!prop.containsKey("crawlingDepth")) {
prop.put("crawlingDepth", Math.min(3,
env.getConfigLong("crawlingDepth", 0)));
}
// linked non-parseable documents?
if (post == null) {
prop.put("directDocByURLChecked",
sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
} else {
prop.put("directDocByURLChecked",
post.getBoolean("directDocByURL") ? "1" : "0");
}
// Unlimited crawl depth for URLs matching with
if (post != null && post.containsKey("crawlingDepthExtension")) {
prop.put("crawlingDepthExtension", post.get("crawlingDepthExtension"));
} else {
prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
}
// Limit by maximum Pages per Domain?
if (post == null) {
prop.put("crawlingDomMaxCheck",
(crawlingDomMaxPages == -1) ? "0" : "1");
} else {
prop.put("crawlingDomMaxCheck",
post.getBoolean("crawlingDomMaxCheck") ? "1" : "0");
}
// Maximum Pages per Domain
if (post != null && post.containsKey("crawlingDomMaxPages")) {
final Integer maxPages = post.getInt("crawlingDomMaxPages", -1);
// page count is limited to six digits, zero not allowed
if (maxPages > 0 && maxPages < 1000000) {
prop.put("crawlingDomMaxPages", maxPages);
}
}
if (!prop.containsKey("crawlingDomMaxPages")) {
prop.put("crawlingDomMaxPages",
(crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
}
// Accept URLs with query-part?
// Obey html-robots-noindex?
if (post == null) {
prop.put("crawlingQChecked",
env.getConfigBool("crawlingQ", true) ? "1" : "0");
prop.put("obeyHtmlRobotsNoindexChecked",
env.getConfigBool("obeyHtmlRobotsNoindex", true) ? "1" : "0");
} else {
prop.put("crawlingQChecked", post.getBoolean("crawlingQ") ? "1" : "0");
prop.put("obeyHtmlRobotsNoindexChecked",
post.getBoolean("obeyHtmlRobotsNoindex") ? "1" : "0");
}
// Load Filter on URLs (range)
if (post != null && post.containsKey("range")) {
final String range = post.get("range");
if (range.equalsIgnoreCase("domain")) {
prop.put("range_domain", "1");
} else if (range.equalsIgnoreCase("subpath")) {
prop.put("range_subpath", "1");
} else if (range.equalsIgnoreCase("wide")) {
prop.put("range_wide", "1");
}
} else {
prop.put("range_wide", "1");
}
// Load Filter on URLs: must match
if (post != null && post.containsKey("mustmatch")) {
prop.put("mustmatch", post.get("mustmatch"));
} else {
prop.put("mustmatch", CrawlProfile.MATCH_ALL_STRING);
}
// Load Filter on URLs: must-not-match
if (post != null && post.containsKey("mustnotmatch")) {
prop.put("mustnotmatch", post.get("mustnotmatch"));
} else {
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}
// Load Filter on IPs: must match
if (post != null && post.containsKey("ipMustmatch")) {
prop.put("ipMustmatch", post.get("ipMustmatch"));
} else {
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch",
CrawlProfile.MATCH_ALL_STRING));
}
// Load Filter on IPs: must-not-match
if (post != null && post.containsKey("ipMustnotmatch")) {
prop.put("ipMustnotmatch", post.get("ipMustnotmatch"));
} else {
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch",
CrawlProfile.MATCH_NEVER_STRING));
}
// Use Country Codes Match-List?
if (post == null) {
// use the default that was set in the original template
prop.put("countryMustMatchSwitchChecked", "0");
} else {
prop.put("countryMustMatchSwitchChecked",
post.getBoolean("countryMustMatchSwitch") ? "1" : "0");
}
// Must-Match List for Country Codes
if (post != null && post.containsKey("countryMustMatchList")) {
prop.put("countryMustMatch", post.get("countryMustMatchList"));
} else {
prop.put("countryMustMatch",
sb.getConfig("crawlingCountryMustMatch", ""));
}
// ---------- Document filter
// Indexer filter on URLs: must match
if (post != null && post.containsKey("indexmustmatch")) {
prop.put("indexmustmatch", post.get("indexmustmatch"));
} else {
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
}
// Indexer filter on URLs: must-no-match
if (post != null && post.containsKey("indexmustnotmatch")) {
prop.put("indexmustnotmatch", post.get("indexmustnotmatch"));
} else {
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}
// Filter on Content of Document: must match
if (post != null && post.containsKey("indexcontentmustmatch")) {
prop.put("indexcontentmustmatch", post.get("indexcontentmustmatch"));
} else {
prop.put("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
}
// Filter on Content of Document: must-not-match
if (post != null && post.containsKey("indexcontentmustnotmatch")) {
prop.put("indexcontentmustnotmatch",
post.get("indexcontentmustnotmatch"));
} else {
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}
// ---------- Clean-Up before Crawl Start
// delete if older settings: number value
if (post != null && post.containsKey("deleteIfOlderNumber")) {
final Integer olderNumber = post.getInt("deleteIfOlderNumber", -1);
if (olderNumber > 0 && olderNumber <= 12) {
prop.put("deleteIfOlderNumber_" + olderNumber, "1");
} else {
switch (olderNumber) {
case 14: prop.put("deleteIfOlderNumber_14", "1"); break;
case 21: prop.put("deleteIfOlderNumber_21", "1"); break;
case 28: prop.put("deleteIfOlderNumber_28", "1"); break;
case 30: prop.put("deleteIfOlderNumber_30", "1"); break;
default: prop.put("deleteIfOlderNumber_14", "1"); break;
}
}
} else {
prop.put("deleteIfOlderNumber_14", "1");
}
// delete if older settings: number unit
if (post != null && post.containsKey("deleteIfOlderUnit")) {
final String olderUnit = post.get("deleteIfOlderUnit");
if (olderUnit.equalsIgnoreCase("year")) {
prop.put("deleteIfOlderUnit_year", "1");
} else if (olderUnit.equalsIgnoreCase("month")) {
prop.put("deleteIfOlderUnit_month", "1");
} else if (olderUnit.equalsIgnoreCase("hour")) {
prop.put("deleteIfOlderUnit_hour", "1");
} else {
prop.put("deleteIfOlderUnit_day", "1");
}
} else {
prop.put("deleteIfOlderUnit_day", "1");
}
// delete any document before the crawl is started?
if (post != null && post.containsKey("deleteold")) {
final String deleteold = post.get("deletold");
if (deleteold.equalsIgnoreCase("on")){
post.put("deleteold_on", "1");
} else if (deleteold.equalsIgnoreCase("age")) {
post.put("deleteold_age", "1");
} else {
post.put("deleteold_off", "1");
}
} else {
prop.put("deleteold_off", "1");
}
// ---------- Double-Check Rules
// reload settings: number value
if (post != null && post.containsKey("reloadIfOlderNumber")) {
final Integer olderNumber = post.getInt("reloadIfOlderNumber", -1);
if (olderNumber > 0 && olderNumber <= 12) {
prop.put("reloadIfOlderNumber_" + olderNumber, "1");
} else {
switch (olderNumber) {
case 14: prop.put("reloadIfOlderNumber_14", "1"); break;
case 21: prop.put("reloadIfOlderNumber_21", "1"); break;
case 28: prop.put("reloadIfOlderNumber_28", "1"); break;
case 30: prop.put("reloadIfOlderNumber_30", "1"); break;
default: prop.put("reloadIfOlderNumber_14", "1"); break;
}
}
} else {
prop.put("reloadIfOlderNumber_14", "1");
}
// reload settings: number unit
if (post != null && post.containsKey("reloadIfOlderUnit")) {
final String olderUnit = post.get("reloadIfOlderUnit");
if (olderUnit.equalsIgnoreCase("year")) {
prop.put("reloadIfOlderUnit_year", "1");
} else if (olderUnit.equalsIgnoreCase("month")) {
prop.put("reloadIfOlderUnit_month", "1");
} else if (olderUnit.equalsIgnoreCase("hour")) {
prop.put("reloadIfOlderUnit_hour", "1");
} else {
prop.put("reloadIfOlderUnit_day", "1");
}
} else {
prop.put("reloadIfOlderUnit_day", "1");
}
if (post != null && post.containsKey("recrawl")) {
final String recrawl = post.get("recrawl");
if (recrawl.equalsIgnoreCase("reload")) {
prop.put("recrawl_reload", "1");
} else {
prop.put("recrawl_nodoubles", "1");
}
} else {
prop.put("recrawl_nodoubles", "1");
}
// ---------- Document Cache
// Store to Web Cache?
if (post == null) {
prop.put("storeHTCacheChecked",
env.getConfigBool("storeHTCache", true) ? "1" : "0");
} else {
prop.put("storeHTCacheChecked",
post.getBoolean("storeHTCache") ? "1" : "0");
}
// Policy for usage of Web Cache
if (post != null && post.containsKey("cachePolicy")) {
final String cachePolicy = post.get("chachePolicy");
if (cachePolicy.equalsIgnoreCase("nocache")) {
prop.put("cachePolicy_nocache", "1");
} else if (cachePolicy.equalsIgnoreCase("ifexist")) {
prop.put("cachePolicy_ifexist", "1");
} else if (cachePolicy.equalsIgnoreCase("cacheonly")) {
prop.put("cachePolicy_cacheonly", "1");
} else {
prop.put("cachePolicy_iffresh", "1");
}
} else {
prop.put("cachePolicy_iffresh", "1");
}
// ---------- Agent name (untested & untouched)
if (sb.isP2PMode()) {
prop.put("agentSelect", 0);
} else {
prop.put("agentSelect", 1);
List<String> agentNames = new ArrayList<String>();
if (sb.isIntranetMode()) {
agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
}
if (sb.isGlobalMode()) {
agentNames.add(ClientIdentification.yacyInternetCrawlerAgentName);
}
agentNames.add(ClientIdentification.googleAgentName);
if (sb.isAllIPMode()) {
agentNames.add(ClientIdentification.browserAgentName);
}
for (int i = 0; i < agentNames.size(); i++) {
prop.put("agentSelect_list_" + i + "_name", agentNames.get(i));
}
prop.put("agentSelect_list", agentNames.size());
}
prop.put("agentSelect_defaultAgentName",
ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Index Administration
// Do Local Indexing
if (post == null) {
// Local index text?
prop.put("indexingTextChecked",
env.getConfigBool("indexText", true) ? "1" : "0");
// Local index media?
prop.put("indexingMediaChecked",
env.getConfigBool("indexMedia", true) ? "1" : "0");
// Do Remote Indexing?
prop.put("crawlOrderChecked",
env.getConfigBool("crawlOrder", true) ? "1" : "0");
// Remote crawl intention
prop.put("intention", "");
} else {
prop.put("indexingTextChecked",
post.getBoolean("indexText") ? "1" : "0");
prop.put("indexingMediaChecked",
post.getBoolean("indexMedia") ? "1" : "0");
prop.put("crawlOrderChecked",
post.getBoolean("crawlOrder") ? "1" : "0");
prop.put("intention", post.get("intention"));
}
// Target collection
boolean collectionEnabled =
sb.index.fulltext().getDefaultConfiguration().isEmpty() ||
sb.index.fulltext().getDefaultConfiguration().contains(
CollectionSchema.collection_sxt);
prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
if (collectionEnabled) {
if (post != null && post.containsKey("collection")) {
prop.put("collection", post.get("collection"));
} else {
prop.put("collection", collectionEnabled ? "user" : "");
}
}
/* probably unused (no corresponding entry in template)
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
final int crawlingDomFilterDepth = env.getConfigInt("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
final int crawlingDomMaxPages = env.getConfigInt("crawlingDomMaxPages", -1);
prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", true) ? "1" : "0");
prop.put("followFramesChecked", env.getConfigBool("followFrames", true) ? "1" : "0");
prop.put("obeyHtmlRobotsNoindexChecked", env.getConfigBool("obeyHtmlRobotsNoindex", true) ? "1" : "0");
prop.put("storeHTCacheChecked", env.getConfigBool("storeHTCache", true) ? "1" : "0");
prop.put("indexingTextChecked", env.getConfigBool("indexText", true) ? "1" : "0");
prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", true) ? "1" : "0");
prop.put("crawlOrderChecked", env.getConfigBool("crawlOrder", true) ? "1" : "0");
final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 100L);
final int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
@ -83,25 +482,8 @@ public class CrawlStartExpert_p {
prop.put("xsstopwChecked", env.getConfigBool("xsstopw", true) ? "1" : "0");
prop.put("xdstopwChecked", env.getConfigBool("xdstopw", true) ? "1" : "0");
prop.put("xpstopwChecked", env.getConfigBool("xpstopw", true) ? "1" : "0");
*/
boolean collectionEnabled = sb.index.fulltext().getDefaultConfiguration().isEmpty() || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt);
prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
prop.put("collection", collectionEnabled ? "user" : "");
if (sb.isP2PMode()) {
prop.put("agentSelect", 0);
} else {
prop.put("agentSelect", 1);
List<String> agentNames = new ArrayList<String>();
if (sb.isIntranetMode()) agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
if (sb.isGlobalMode()) agentNames.add(ClientIdentification.yacyInternetCrawlerAgentName);
agentNames.add(ClientIdentification.googleAgentName);
if (sb.isAllIPMode()) agentNames.add(ClientIdentification.browserAgentName);
for (int i = 0; i < agentNames.size(); i++) {
prop.put("agentSelect_list_" + i + "_name", agentNames.get(i));
}
prop.put("agentSelect_list", agentNames.size());
}
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
// return rewrite properties
return prop;
}
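A design note on the servlet changes above: nearly every block follows the same echo-or-default shape — if the parameter was posted, put it back into prop; otherwise fall back to the configured or hard-coded default. A minimal sketch of that shape using plain maps (the putOrDefault helper is hypothetical and not part of this commit; serverObjects is modeled here as Map<String, String>, and the defaults are placeholders):

    import java.util.HashMap;
    import java.util.Map;

    // Illustration only: the repeated "echo the posted value, else use a default"
    // pattern from CrawlStartExpert_p.respond(), factored into one helper.
    public class EchoOrDefaultSketch {

        // Hypothetical helper; the real servlet inlines this per parameter and
        // adds extra checks (e.g. a non-empty trim for the start-point fields).
        static void putOrDefault(final Map<String, String> prop, final Map<String, String> post,
                                 final String key, final String dflt) {
            if (post != null && post.containsKey(key)) {
                prop.put(key, post.get(key));
            } else {
                prop.put(key, dflt);
            }
        }

        public static void main(String[] args) {
            final Map<String, String> post = new HashMap<>(); // stands in for the POST parameters
            post.put("mustmatch", "https?://example\\.org/.*");
            final Map<String, String> prop = new HashMap<>(); // stands in for the template properties
            putOrDefault(prop, post, "mustmatch", ".*");   // echoed: the posted value wins
            putOrDefault(prop, post, "mustnotmatch", ""); // defaulted: nothing was posted
            System.out.println(prop.get("mustmatch"));     // https?://example\.org/.*
            System.out.println(prop.get("mustnotmatch"));  // (empty placeholder default)
        }
    }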

@ -1,81 +1,81 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://www.netbeans.org/ns/project/1">
<type>org.netbeans.modules.ant.freeform</type>
<configuration>
<general-data xmlns="http://www.netbeans.org/ns/freeform-project/1">
<name>YaCy</name>
</general-data>
<general-data xmlns="http://www.netbeans.org/ns/freeform-project/2">
<!-- Do not use Project Properties customizer when editing this file manually. -->
<name>YaCy</name>
<properties/>
<folders>
<source-folder>
<label>source</label>
<type>java</type>
<location>source</location>
<encoding>UTF-8</encoding>
</source-folder>
<source-folder>
<label>htroot</label>
<type>java</type>
<location>htroot</location>
<encoding>UTF-8</encoding>
</source-folder>
</folders>
<ide-actions>
<action name="build">
<target>compile</target>
</action>
<action name="clean">
<target>clean</target>
</action>
<action name="javadoc">
<target>javadoc</target>
</action>
<action name="run">
<target>run</target>
</action>
<action name="test">
<target>test</target>
</action>
<action name="rebuild">
<target>clean</target>
<target>compile</target>
</action>
</ide-actions>
<view>
<items>
<source-folder style="packages">
<label>source</label>
<location>source</location>
</source-folder>
<source-folder style="packages">
<label>htroot</label>
<location>htroot</location>
</source-folder>
<source-file>
<location>build.xml</location>
</source-file>
</items>
<context-menu>
<ide-action name="build"/>
<ide-action name="rebuild"/>
<ide-action name="clean"/>
<ide-action name="javadoc"/>
<ide-action name="run"/>
<ide-action name="test"/>
</context-menu>
</view>
<subprojects/>
</general-data>
<java-data xmlns="http://www.netbeans.org/ns/freeform-project-java/3">
<compilation-unit>
<package-root>source</package-root>
<package-root>htroot</package-root>
<classpath mode="compile">lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.3.jar;lib/fontbox-1.7.1.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-13.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.3.jar;lib/httpcore-4.3.jar;lib/httpmime-4.3.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.7.1.jar;lib/jena-2.6.4.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.17.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.2.1.jar;lib/lucene-analyzers-phonetic-4.2.1.jar;lib/lucene-core-4.2.1.jar;lib/lucene-misc-4.2.1.jar;lib/lucene-spatial-4.2.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.1.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.7.2.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.2.1.jar;lib/solr-solrj-4.2.1.jar;lib/spatial4j-0.3.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.5.jar</classpath>
<source-level>1.6</source-level>
</compilation-unit>
</java-data>
</configuration>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://www.netbeans.org/ns/project/1">
<type>org.netbeans.modules.ant.freeform</type>
<configuration>
<general-data xmlns="http://www.netbeans.org/ns/freeform-project/1">
<name>YaCy-clone</name>
</general-data>
<general-data xmlns="http://www.netbeans.org/ns/freeform-project/2">
<!-- Do not use Project Properties customizer when editing this file manually. -->
<name>YaCy-clone</name>
<properties/>
<folders>
<source-folder>
<label>source</label>
<type>java</type>
<location>source</location>
<encoding>UTF-8</encoding>
</source-folder>
<source-folder>
<label>htroot</label>
<type>java</type>
<location>htroot</location>
<encoding>UTF-8</encoding>
</source-folder>
</folders>
<ide-actions>
<action name="build">
<target>compile</target>
</action>
<action name="clean">
<target>clean</target>
</action>
<action name="javadoc">
<target>javadoc</target>
</action>
<action name="run">
<target>run</target>
</action>
<action name="test">
<target>test</target>
</action>
<action name="rebuild">
<target>clean</target>
<target>compile</target>
</action>
</ide-actions>
<view>
<items>
<source-folder style="packages">
<label>source</label>
<location>source</location>
</source-folder>
<source-folder style="packages">
<label>htroot</label>
<location>htroot</location>
</source-folder>
<source-file>
<location>build.xml</location>
</source-file>
</items>
<context-menu>
<ide-action name="build"/>
<ide-action name="rebuild"/>
<ide-action name="clean"/>
<ide-action name="javadoc"/>
<ide-action name="run"/>
<ide-action name="test"/>
</context-menu>
</view>
<subprojects/>
</general-data>
<java-data xmlns="http://www.netbeans.org/ns/freeform-project-java/3">
<compilation-unit>
<package-root>source</package-root>
<package-root>htroot</package-root>
<classpath mode="compile">lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.3.jar;lib/fontbox-1.7.1.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-13.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.3.jar;lib/httpcore-4.3.jar;lib/httpmime-4.3.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.7.1.jar;lib/jena-2.6.4.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.17.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.2.1.jar;lib/lucene-analyzers-phonetic-4.2.1.jar;lib/lucene-core-4.2.1.jar;lib/lucene-misc-4.2.1.jar;lib/lucene-spatial-4.2.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.1.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.7.2.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.2.1.jar;lib/solr-solrj-4.2.1.jar;lib/spatial4j-0.3.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.5.jar</classpath>
<source-level>1.6</source-level>
</compilation-unit>
</java-data>
</configuration>
</project>
