replaced the auto-dom filter with an easy-to-understand Site Link-List crawler option

- nobody understands the auto-dom filter without a lengthy introduction to how a crawler works
- nobody ever used the auto-dom filter with a crawl depth other than 1
- the auto-dom filter was buggy: the filter did not survive a restart, and the search index then filled up with unwanted content
- the function of the auto-dom filter was in fact just to load a link list from the given start URL and then start separate crawls for all of these URLs, each restricted to its own domain (see the sketch below the commit metadata)
- the new Site Link-List option shows the target URLs in real time while the start URL is typed (like the robots check) and gives transparent feedback on what it will do before it can be used
- the new option also fits into the easy site-crawl start menu

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7213 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 63e387508c
commit f6eebb6f99
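
The heart of the new option, as the Crawler_p.java hunk further down shows, is to turn the links found on the start page into a must-match filter of the form protocol://host.* for each linked host, joined with '|', so that every linked site is crawled but the crawl cannot leave those hosts. Below is a minimal standalone sketch of that step, not the committed code: the class and method names are invented for illustration, the link list is assumed to be already extracted (the commit uses YaCy's ContentScraper and MultiProtocolURI for that), and unlike the committed code the sketch quotes the host names and drops duplicate hosts.

import java.net.URI;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

public class SiteListFilterSketch {

    // Build a regular expression that matches any URL on one of the linked hosts.
    static String hostFilter(List<String> links) {
        Set<String> parts = new LinkedHashSet<String>(); // keeps insertion order, drops duplicate hosts
        for (String link : links) {
            try {
                URI uri = new URI(link);
                if (uri.getScheme() == null || uri.getHost() == null) continue;
                // one alternative per linked host, e.g. "http://\Qexample.org\E.*"
                parts.add(uri.getScheme() + "://" + Pattern.quote(uri.getHost()) + ".*");
            } catch (Exception e) {
                // skip malformed links instead of aborting the whole crawl start
            }
        }
        return parts.isEmpty() ? ".*" : String.join("|", parts);
    }

    public static void main(String[] args) {
        // hypothetical link list, e.g. anchors scraped from a bookmark page
        List<String> links = List.of(
                "http://example.org/a.html",
                "http://example.org/b.html",
                "https://other.example.com/index.html");
        String mustMatch = hostFilter(links);
        System.out.println(mustMatch);
        System.out.println(Pattern.matches(mustMatch, "https://other.example.com/deep/page")); // true
    }
}

In the commit itself this filter string becomes the must-match pattern of the new CrawlProfile, and every extracted link is then enqueued on the crawl stacker under that profile.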

@ -38,8 +38,7 @@
<td><strong>Must Match</strong></td> <td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td> <td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td> <td><strong>MaxAge</strong></td>
<td><strong>Auto Filter Depth</strong></td> <td><strong>Domain Counter Content</strong></td>
<td><strong>Auto Filter Content</strong></td>
<td><strong>Max Page Per Domain</strong></td> <td><strong>Max Page Per Domain</strong></td>
<td><strong>Accept '?' URLs</strong></td> <td><strong>Accept '?' URLs</strong></td>
<td><strong>Fill Proxy Cache</strong></td> <td><strong>Fill Proxy Cache</strong></td>
@ -70,7 +69,6 @@
<td>#[mustmatch]#</td> <td>#[mustmatch]#</td>
<td>#[mustnotmatch]#</td> <td>#[mustnotmatch]#</td>
<td>#[crawlingIfOlder]#</td> <td>#[crawlingIfOlder]#</td>
<td>#[crawlingDomFilterDepth]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td> <td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>
<td>#[crawlingDomMaxPages]#</td> <td>#[crawlingDomMaxPages]#</td>
<td>#(withQuery)#no::yes#(/withQuery)#</td> <td>#(withQuery)#no::yes#(/withQuery)#</td>

@ -87,7 +87,6 @@ public class CrawlProfileEditor_p {
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
@ -245,7 +244,7 @@ public class CrawlProfileEditor_p {
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth())); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
// start contrib [MN] // start contrib [MN]
int i = 0; int i = 0;

@ -9,7 +9,6 @@
<mustmatch>#[mustmatch]#</mustmatch> <mustmatch>#[mustmatch]#</mustmatch>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch> <mustnotmatch>#[mustnotmatch]#</mustnotmatch>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder> <crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
<crawlingDomFilterDepth>#[crawlingDomFilterDepth]#</crawlingDomFilterDepth>
<crawlingDomFilterContent> <crawlingDomFilterContent>
#{crawlingDomFilterContent}# #{crawlingDomFilterContent}#
<item>#[item]#</item> <item>#[item]#</item>

@ -44,6 +44,13 @@
<input name="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" /> <input name="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" />
</td> </td>
</tr> </tr>
<tr>
<td><label for="url"><span class="nobr">From Link-List of URL</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"/></td>
<td>
<div id="sitelistURLs"></div>
</td>
</tr>
<tr> <tr>
<td><label for="url"><span class="nobr">From Sitemap</span></label>:</td> <td><label for="url"><span class="nobr">From Sitemap</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td> <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td>
@ -154,22 +161,6 @@
If you don't know what this means, please leave this field empty. If you don't know what this means, please leave this field empty.
</td> </td>
</tr> </tr>
<tr valign="top" class="TableCellLight">
<td>Auto-Dom-Filter:</td>
<td>
<label for="crawlingDomFilterCheck">Use</label>:
<input type="checkbox" name="crawlingDomFilterCheck" id="crawlingDomFilterCheck" #(crawlingDomFilterCheck)#::checked="checked"#(/crawlingDomFilterCheck)# />&nbsp;&nbsp;
<label for="crawlingDomFilterDepth">Depth</label>:
<input name="crawlingDomFilterDepth" id="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#" />
</td>
<td>
This option will automatically create a domain-filter which limits the crawl on domains the crawler
will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while
restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth
for this example would be 1.<br />
The default value 0 gives no restrictions.
</td>
</tr>
<tr valign="top" class="TableCellDark"> <tr valign="top" class="TableCellDark">
<td>Maximum Pages per Domain:</td> <td>Maximum Pages per Domain:</td>
<td> <td>

@ -42,13 +42,18 @@
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/> <input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</td> </td>
<td> <td>
<span id="robotsOK"></span> <span id="robotsOK"></span><img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
<img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" /> </td>
</td></tr><tr> </tr><tr>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Link-List of URL</td>
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled" <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL</td> onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL</td>
<td><input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/></td> <td><input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/></td>
</tr></table><br/> </tr>
</table><br/>
</dd> </dd>
<input type="hidden" name="crawlingDepth" id="crawlingDepth" value="99"> <input type="hidden" name="crawlingDepth" id="crawlingDepth" value="99">
<dt><label>Scheduler</label></dt> <dt><label>Scheduler</label></dt>

@ -60,10 +60,6 @@ import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyNewsPool; import de.anomic.yacy.yacyNewsPool;
public class Crawler_p { public class Crawler_p {
public static final String CRAWLING_MODE_URL = "url";
public static final String CRAWLING_MODE_FILE = "file";
public static final String CRAWLING_MODE_SITEMAP = "sitemap";
// this servlet does NOT create the Crawler servlet page content! // this servlet does NOT create the Crawler servlet page content!
// this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
@ -102,372 +98,405 @@ public class Crawler_p {
} }
prop.put("info", "0"); prop.put("info", "0");
if (post != null) {
// a crawl start
if (post.containsKey("continue")) { if (post != null && post.containsKey("continue")) {
// continue queue // continue queue
final String queue = post.get("continue", ""); final String queue = post.get("continue", "");
if (queue.equals("localcrawler")) { if (queue.equals("localcrawler")) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) { } else if (queue.equals("remotecrawler")) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
} }
}
if (post.containsKey("pause")) { if (post != null && post.containsKey("pause")) {
// pause queue // pause queue
final String queue = post.get("pause", ""); final String queue = post.get("pause", "");
if (queue.equals("localcrawler")) { if (queue.equals("localcrawler")) {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) { } else if (queue.equals("remotecrawler")) {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
} }
}
if (post.containsKey("crawlingstart")) {
// init crawl if (post != null && post.containsKey("crawlingstart")) {
if (sb.peers == null) { // init crawl
prop.put("info", "3"); if (sb.peers == null) {
} else { prop.put("info", "3");
String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url } else {
// add the prefix http:// if necessary String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
int pos = crawlingStart.indexOf("://"); // add the prefix http:// if necessary
if (pos == -1) crawlingStart = "http://" + crawlingStart; int pos = crawlingStart.indexOf("://");
if (pos == -1) crawlingStart = "http://" + crawlingStart;
// normalizing URL // normalize URL
DigestURI crawlingStartURL = null; DigestURI crawlingStartURL = null;
try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {} try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {}
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true); crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
// set new properties // set new properties
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
// set the crawling filter // set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases: // special cases:
if (crawlingStartURL!= null && fullDomain) { if (crawlingStartURL!= null && fullDomain) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
} }
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
} }
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false"); env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8")); int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
// recrawl // recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
if (recrawl.equals("scheduler") && repeat_time > 0) { if (recrawl.equals("scheduler") && repeat_time > 0) {
// set crawlingIfOlder attributes that are appropriate for scheduled crawling // set crawlingIfOlder attributes that are appropriate for scheduled crawling
crawlingIfOlderCheck = true; crawlingIfOlderCheck = true;
crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12; crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
crawlingIfOlderUnit = "hour"; crawlingIfOlderUnit = "hour";
} else if (recrawl.equals("reload")) { } else if (recrawl.equals("reload")) {
repeat_time = -1; repeat_time = -1;
crawlingIfOlderCheck = true; crawlingIfOlderCheck = true;
} else if (recrawl.equals("nodoubles")) { } else if (recrawl.equals("nodoubles")) {
repeat_time = -1; repeat_time = -1;
crawlingIfOlderCheck = false; crawlingIfOlderCheck = false;
} }
long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder); env.setConfig("crawlingIfOlder", crawlingIfOlder);
// store this call as api call // store this call as api call
if (repeat_time > 0) { if (repeat_time > 0) {
// store as scheduled api call // store as scheduled api call
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3)); sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
} else { } else {
// store just a protocol // store just a protocol
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
} }
final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1; final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth)); final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1; final boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
final boolean crawlingQ = post.get("crawlingQ", "off").equals("on"); final boolean indexText = post.get("indexText", "off").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false"); env.setConfig("indexText", (indexText) ? "true" : "false");
final boolean indexText = post.get("indexText", "off").equals("on"); final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexText", (indexText) ? "true" : "false"); env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
final boolean indexMedia = post.get("indexMedia", "off").equals("on"); final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false"); env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); final String cachePolicyString = post.get("cachePolicy", "iffresh");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
final String cachePolicyString = post.get("cachePolicy", "iffresh"); if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE; if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;
if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST; final boolean xsstopw = post.get("xsstopw", "off").equals("on");
if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY; env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
final boolean xsstopw = post.get("xsstopw", "off").equals("on"); final boolean xdstopw = post.get("xdstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false"); env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
final boolean xdstopw = post.get("xdstopw", "off").equals("on"); final boolean xpstopw = post.get("xpstopw", "off").equals("on");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false"); env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
final boolean xpstopw = post.get("xpstopw", "off").equals("on"); final String crawlingMode = post.get("crawlingMode","url");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false"); if (crawlingMode.equals("url")) {
final String crawlingMode = post.get("crawlingMode","url"); // check if pattern matches
if (crawlingMode.equals(CRAWLING_MODE_URL)) { if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if pattern matches // check if the crawl filter works correctly
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { Pattern.compile(newcrawlingMustMatch);
// print error message
prop.put("info", "4"); //crawlfilter does not match url // stack request
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); // first delete old entry, if exists
prop.putHTML("info_crawlingStart", crawlingStart); final DigestURI url = new DigestURI(crawlingStart, null);
} else try { final byte[] urlhash = url.hash();
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(),
0,
0,
0
));
if (reasonString == null) {
// create a bookmark from crawl start url
Set<String> tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
tags.add("crawlStart");
if (post.get("createBookmark","off").equals("on")) {
bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if(bookmark != null){
bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", post.get("intention", "").replace(',', '/'));
sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
}
} else {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", reasonString);
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
crawlingStartURL,
null,
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.peers.mySeed().hash.getBytes(),
new Date(),
1,
reasonString);
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
} else if (crawlingMode.equals("file")) {
if (post.containsKey("crawlingFile")) {
final String fileName = post.get("crawlingFile");
try {
// check if the crawl filter works correctly // check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch); Pattern.compile(newcrawlingMustMatch);
final File file = new File(fileName);
// stack request final String fileString = post.get("crawlingFile$file");
// first delete old entry, if exists final ContentScraper scraper = new ContentScraper(new DigestURI(file));
final DigestURI url = new DigestURI(crawlingStart, null); final Writer writer = new TransformerWriter(null, null, scraper, null, false);
final byte[] urlhash = url.hash(); FileUtils.copy(fileString, writer);
indexSegment.urlMetadata().remove(urlhash); writer.close();
sb.crawlQueues.noticeURL.removeByURLHash(urlhash); final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
sb.crawlQueues.errorURL.remove(urlhash); final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
// stack url fileName, crawlURL,
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch, newcrawlingMustMatch,
newcrawlingMustNotMatch, CrawlProfile.MATCH_NEVER,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ, crawlingQ,
indexText, indexMedia, indexText,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); indexMedia,
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); storeHTCache,
final String reasonString = sb.crawlStacker.stackCrawl(new Request( true,
sb.peers.mySeed().hash.getBytes(), crawlOrder,
url, xsstopw, xdstopw, xpstopw,
null, cachePolicy);
"CRAWLING-ROOT", sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
new Date(), sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
pe.handle(), final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
0, DigestURI nexturl;
0, while (linkiterator.hasNext()) {
0 final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
)); if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
if (reasonString == null) { sb.crawlStacker.enqueueEntry(new Request(
// create a bookmark from crawl start url sb.peers.mySeed().hash.getBytes(),
Set<String> tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); nexturl,
tags.add("crawlStart"); null,
if (post.get("createBookmark","off").equals("on")) { e.getValue(),
bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if(bookmark != null){
bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", post.get("intention", "").replace(',', '/'));
sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
}
} else {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", reasonString);
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
crawlingStartURL,
null,
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.peers.mySeed().hash.getBytes(),
new Date(), new Date(),
1, profile.handle(),
reasonString); 0,
0,
0
));
} }
} catch (final PatternSyntaxException e) { } catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage()); prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) { } catch (final Exception e) {
// mist // mist
prop.put("info", "6");//Error with url prop.put("info", "7");//Error with file
prop.putHTML("info_crawlingStart", crawlingStart); prop.putHTML("info_crawlingStart", fileName);
prop.putHTML("info_error", e.getMessage()); prop.putHTML("info_error", e.getMessage());
Log.logException(e); Log.logException(e);
} }
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
} else if (crawlingMode.equals("sitemap")) {
String sitemapURLStr = post.get("sitemapURL","");
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
importer.start();
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
} else if (crawlingMode.equals("sitelist")) {
try {
final DigestURI sitelistURL = new DigestURI(crawlingStart, null);
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
String title = scraper.getTitle();
// String description = scraper.getDescription();
} else if (crawlingMode.equals(CRAWLING_MODE_FILE)) { // get links and generate filter
if (post.containsKey("crawlingFile")) { StringBuilder filter = new StringBuilder();
// getting the name of the uploaded file final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final String fileName = post.get("crawlingFile"); for (MultiProtocolURI uri: hyperlinks.keySet()) {
try { filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
// loading the file content
final File file = new File(fileName);
// getting the content of the bookmark file
final String fileString = post.get("crawlingFile$file");
// parsing the bookmark file and fetching the headline and contained links
final ContentScraper scraper = new ContentScraper(new DigestURI(file));
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null,null,scraper,null,false);
FileUtils.copy(fileString, writer);
writer.close();
//String headline = scraper.getHeadline();
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
// creating a crawler profile
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomFilterDepth,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
// pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
// loop through the contained links
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
DigestURI nexturl;
while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
// enqueuing the url for crawling
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
new Date(),
profile.handle(),
0,
0,
0
));
}
} catch (final PatternSyntaxException e) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "7");//Error with file
prop.putHTML("info_crawlingStart", fileName);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} }
} else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) { newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";
String sitemapURLStr = null;
try { // put links onto crawl queue
// getting the sitemap URL final CrawlProfile profile = new CrawlProfile(
sitemapURLStr = post.get("sitemapURL",""); title == null || title.length() == 0 ? sitelistURL.getHost() : title,
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); sitelistURL,
newcrawlingMustMatch,
// create a new profile CrawlProfile.MATCH_NEVER,
final CrawlProfile pe = new CrawlProfile( newcrawlingdepth,
sitemapURLStr, sitemapURL, crawlingIfOlder,
newcrawlingMustMatch, crawlingDomMaxPages,
CrawlProfile.MATCH_NEVER, crawlingQ,
newcrawlingdepth, indexText,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, indexMedia,
crawlingQ, storeHTCache,
indexText, indexMedia, true,
storeHTCache, true, crawlOrder, crawlOrder,
xsstopw, xdstopw, xpstopw, xsstopw, xdstopw, xpstopw,
cachePolicy); cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
// create a new sitemap importer final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe); DigestURI nexturl;
importer.start(); while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
} catch (final Exception e) { if (e.getKey() == null) continue;
// mist nexturl = new DigestURI(e.getKey());
prop.put("info", "6");//Error with url // remove the url from the database to be prepared to crawl them again
prop.putHTML("info_crawlingStart", sitemapURLStr); final byte[] urlhash = nexturl.hash();
prop.putHTML("info_error", e.getMessage()); indexSegment.urlMetadata().remove(urlhash);
Log.logException(e); sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
} sb.crawlQueues.errorURL.remove(urlhash);
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
new Date(),
profile.handle(),
0,
0,
0
));
}
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
} }
} }
} }
}
if (post.containsKey("crawlingPerformance")) {
setPerformance(sb, post); if (post != null && post.containsKey("crawlingPerformance")) {
} setPerformance(sb, post);
} }
// performance settings // performance settings

@ -152,7 +152,6 @@ public class QuickCrawlLink_p {
crawlingMustNotMatch, crawlingMustNotMatch,
CrawlingDepth, CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction -1, // domMaxPages, if negative: no count restriction
crawlDynamic, crawlDynamic,
indexText, indexText,

@ -81,6 +81,20 @@ public class getpageinfo_p {
// put language // put language
Set<String> languages = scraper.getContentLanguages(); Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next()); prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
StringBuilder links = new StringBuilder();
StringBuilder filter = new StringBuilder();
count = 0;
for (MultiProtocolURI uri: scraper.getAnchors().keySet()) {
links.append(';').append(uri.toNormalform(true, false));
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
prop.putXML("links_" + count + "_link", uri.toNormalform(true, false));
count++;
}
prop.put("links", count);
prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
} }
} }
if(actions.indexOf("robots")>=0){ if(actions.indexOf("robots")>=0){

@ -6,9 +6,16 @@
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots> <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<sitemap>#[sitemap]#</sitemap> <sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon> <favicon>#[favicon]#</favicon>
<sitelist>#[sitelist]#</sitelist>
<filter>#[filter]#</filter>
<tags> <tags>
#{tags}# #{tags}#
<tag name="#[tag]#" /> <tag name="#[tag]#" />
#{/tags}# #{/tags}#
</tags> </tags>
<links>
#{links}#
<link name="#[link]#" />
#{/links}#
</links>
</pageinfo> </pageinfo>

@ -3,12 +3,12 @@ var AJAX_ON="/env/grafics/ajax.gif";
var timeout=""; var timeout="";
function handleResponse(){ function handleResponse(){
if(http.readyState == 4){ if (http.readyState == 4){
var response = http.responseXML; var response = http.responseXML;
// getting the document title // get the document title
doctitle=""; doctitle="";
if(response.getElementsByTagName("title")[0].firstChild!=null){ if (response.getElementsByTagName("title")[0].firstChild!=null){
doctitle=response.getElementsByTagName("title")[0].firstChild.nodeValue; doctitle=response.getElementsByTagName("title")[0].firstChild.nodeValue;
} }
// document.getElementById("title").innerHTML=doctitle; // document.getElementById("title").innerHTML=doctitle;
@ -23,43 +23,51 @@ function handleResponse(){
if(robotsOKspan.firstChild){ if(robotsOKspan.firstChild){
robotsOKspan.removeChild(robotsOKspan.firstChild); robotsOKspan.removeChild(robotsOKspan.firstChild);
} }
if(docrobotsOK==1){ if (docrobotsOK==1){
img=document.createElement("img"); img=document.createElement("img");
img.setAttribute("src", "/env/grafics/ok.png"); img.setAttribute("src", "/env/grafics/ok.png");
img.setAttribute("width", "32px"); img.setAttribute("width", "32px");
img.setAttribute("height", "32px"); img.setAttribute("height", "32px");
robotsOKspan.appendChild(img); robotsOKspan.appendChild(img);
}else if(docrobotsOK==0){ } else if(docrobotsOK==0){
img=document.createElement("img"); img=document.createElement("img");
img.setAttribute("src", "/env/grafics/bad.png"); img.setAttribute("src", "/env/grafics/bad.png");
img.setAttribute("width", "32px"); img.setAttribute("width", "32px");
img.setAttribute("height", "32px"); img.setAttribute("height", "32px");
robotsOKspan.appendChild(img); robotsOKspan.appendChild(img);
robotsOKspan.appendChild(img); robotsOKspan.appendChild(img);
}else{ } else {
robotsOKspan.appendChild(document.createTextNode("")); robotsOKspan.appendChild(document.createTextNode(""));
document.getElementById("robotsOK").innerHTML=""; document.getElementById("robotsOK").innerHTML="";
} }
// getting the sitemap URL contained in the robots.txt // get the sitemap URL contained in the robots.txt
if (document.getElementsByName("sitemapURL").length > 0) { if (document.getElementsByName("sitemapURL").length > 0) {
sitemap=""; sitemap="";
if(response.getElementsByTagName("sitemap")[0].firstChild!=null){ if (response.getElementsByTagName("sitemap")[0].firstChild!=null){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue; sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
} }
document.getElementsByName("sitemapURL")[0].value=sitemap; document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false; document.getElementById("sitemap").disabled=false;
} }
sitelist="";
if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
}
document.getElementById("sitelistURLs").innerHTML = sitelist;
document.getElementById("sitelist").disabled=false;
// clear the ajax image // clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF); document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
} }
} }
function changed(){
function changed() {
window.clearTimeout(timeout); window.clearTimeout(timeout);
timeout=window.setTimeout("loadInfos()", 1500); timeout=window.setTimeout("loadInfos()", 1500);
} }
function loadInfos(){
function loadInfos() {
// displaying ajax image // displaying ajax image
document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON); document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON);

@ -48,7 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String FILTER_MUSTNOTMATCH = "nevermatch"; public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth"; public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages"; public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ"; public static final String CRAWLING_Q = "crawlingQ";
public static final String INDEX_TEXT = "indexText"; public static final String INDEX_TEXT = "indexText";
@ -70,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String mustnotmatch, final String mustnotmatch,
final int depth, final int depth,
final long recrawlIfOlder /*date*/, final long recrawlIfOlder /*date*/,
final int domFilterDepth, final int domMaxPages, final int domMaxPages,
final boolean crawlingQ, final boolean crawlingQ,
final boolean indexText, final boolean indexMedia, final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache, final boolean storeHTCache, final boolean storeTXCache,
@ -87,7 +86,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch); put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(DEPTH, depth); put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder); put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_FILTER_DEPTH, domFilterDepth);
put(DOM_MAX_PAGES, domMaxPages); put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?' put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
put(INDEX_TEXT, indexText); put(INDEX_TEXT, indexText);
@ -186,21 +184,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0L; return 0L;
} }
} }
public int domFilterDepth() {
// if the depth is equal or less to this depth,
// then the current url feeds with its domain the crawl filter
// if this is -1, all domains are feeded
final String r = get(DOM_FILTER_DEPTH);
if (r == null) return Integer.MAX_VALUE;
try {
final int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (final NumberFormatException e) {
Log.logException(e);
return Integer.MAX_VALUE;
}
}
public int domMaxPages() { public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain // this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit // if -1, this means no limit
@ -270,16 +253,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
dp.inc(); dp.inc();
} }
} }
public boolean grantedDomAppearance(final String domain) {
final int max = domFilterDepth();
if (max == Integer.MAX_VALUE) return true;
final DomProfile dp = doms.get(domain);
if (dp == null) {
return 0 < max;
}
return dp.depth <= max;
}
public boolean grantedDomCount(final String domain) { public boolean grantedDomCount(final String domain) {
final int max = domMaxPages(); final int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true; if (max == Integer.MAX_VALUE) return true;
@ -292,10 +265,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public int domSize() { public int domSize() {
return doms.size(); return doms.size();
} }
public boolean domExists(final String domain) {
if (domFilterDepth() == Integer.MAX_VALUE) return true;
return doms.containsKey(domain);
}
public String domName(final boolean attr, final int index){ public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator(); final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();

@ -196,7 +196,7 @@ public final class CrawlStacker {
final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash()); final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
// add domain to profile domain list // add domain to profile domain list
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) { if (profile.domMaxPages() != Integer.MAX_VALUE) {
profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth()); profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
} }
@ -296,12 +296,6 @@ public final class CrawlStacker {
return "post url not allowed"; return "post url not allowed";
} }
// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains.");
return "url does not match domain filter";
}
// deny urls that exceed allowed number of occurrences // deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(url.getHost()))) { if (!(profile.grantedDomCount(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed."); if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed.");

@ -164,9 +164,10 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) { if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling // generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, this.defaultProxyProfile = new CrawlProfile(
"proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true, true, true,
@ -177,38 +178,38 @@ public final class CrawlSwitchboard {
if (this.defaultRemoteProfile == null) { if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling // generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile); this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
} }
if (this.defaultTextSnippetLocalProfile == null) { if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile); this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
} }
if (this.defaultTextSnippetGlobalProfile == null) { if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile); this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
} }
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST); this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) { if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile); this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
} }
if (this.defaultMediaSnippetGlobalProfile == null) { if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile); this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
} }
if (this.defaultSurrogateProfile == null) { if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing // generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile); this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
} }
} }
