Replaced the auto-dom filter with an easy-to-understand Site Link-List crawler option

- nobody understood the auto-dom filter without a lengthy introduction to how a crawler works
- nobody ever used the auto-dom filter other than with a crawl depth of 1
- the auto-dom filter was buggy: the filter did not survive a restart, and the search index then contained waste
- in effect, the auto-dom filter just loaded a link list from the given start URL and then started separate crawls for all of these URLs, each restricted to its own domain
- the new Site Link-List option shows the target URLs in real time while the start URL is typed (like the robots check) and gives transparent feedback about what it will do before it is used (see the sketch after this list)
- the new option also fits into the easy site-crawl start menu
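
A condensed sketch of the new "sitelist" crawl mode, drawn from the Crawler_p.java changes in this commit. This is not the full servlet code: error handling, bookmark creation, URL-hash cleanup and the local-crawl pause/continue around the loop are omitted, and the surrounding servlet variables (sb, crawlingStart, newcrawlingdepth, crawlingIfOlder, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, crawlOrder, the stopword flags and cachePolicy) are assumed to be set up as in the servlet:

    // load the start document and take its link list as the crawl seed set
    final DigestURI sitelistURL = new DigestURI(crawlingStart, null);
    final ContentScraper scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
    final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();

    // build a must-match filter that restricts the crawl to the linked hosts
    final StringBuilder filter = new StringBuilder();
    for (final MultiProtocolURI uri : hyperlinks.keySet()) {
        filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
    }
    // the servlet stores this value as newcrawlingMustMatch
    final String mustMatch = filter.length() > 0 ? filter.substring(1) : "";

    // one crawl profile for the whole link list (profile title simplified to the host here)
    final CrawlProfile profile = new CrawlProfile(
            sitelistURL.getHost(), sitelistURL,
            mustMatch, CrawlProfile.MATCH_NEVER,
            newcrawlingdepth, crawlingIfOlder, crawlingDomMaxPages,
            crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder,
            xsstopw, xdstopw, xpstopw, cachePolicy);
    sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);

    // enqueue every link from the list; the filter keeps each crawl inside its domain
    for (final Map.Entry<MultiProtocolURI, String> e : hyperlinks.entrySet()) {
        if (e.getKey() == null) continue;
        final DigestURI nexturl = new DigestURI(e.getKey());
        sb.crawlStacker.enqueueEntry(new Request(
                sb.peers.mySeed().hash.getBytes(), nexturl, null, e.getValue(),
                new Date(), profile.handle(), 0, 0, 0));
    }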

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7213 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: orbiter
Parent: 63e387508c
Commit: f6eebb6f99

@@ -38,8 +38,7 @@
<td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td>
<td><strong>Auto Filter Depth</strong></td>
<td><strong>Auto Filter Content</strong></td>
<td><strong>Domain Counter Content</strong></td>
<td><strong>Max Page Per Domain</strong></td>
<td><strong>Accept '?' URLs</strong></td>
<td><strong>Fill Proxy Cache</strong></td>
@@ -70,7 +69,6 @@
<td>#[mustmatch]#</td>
<td>#[mustnotmatch]#</td>
<td>#[crawlingIfOlder]#</td>
<td>#[crawlingDomFilterDepth]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>
<td>#[crawlingDomMaxPages]#</td>
<td>#(withQuery)#no::yes#(/withQuery)#</td>

@@ -87,7 +87,6 @@ public class CrawlProfileEditor_p {
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
@@ -245,7 +244,7 @@ public class CrawlProfileEditor_p {
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
// start contrib [MN]
int i = 0;

@@ -9,7 +9,6 @@
<mustmatch>#[mustmatch]#</mustmatch>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
<crawlingDomFilterDepth>#[crawlingDomFilterDepth]#</crawlingDomFilterDepth>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#
<item>#[item]#</item>

@@ -44,6 +44,13 @@
<input name="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" />
</td>
</tr>
<tr>
<td><label for="url"><span class="nobr">From Link-List of URL</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"/></td>
<td>
<div id="sitelistURLs"></div>
</td>
</tr>
<tr>
<td><label for="url"><span class="nobr">From Sitemap</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td>
@@ -154,22 +161,6 @@
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Auto-Dom-Filter:</td>
<td>
<label for="crawlingDomFilterCheck">Use</label>:
<input type="checkbox" name="crawlingDomFilterCheck" id="crawlingDomFilterCheck" #(crawlingDomFilterCheck)#::checked="checked"#(/crawlingDomFilterCheck)# />&nbsp;&nbsp;
<label for="crawlingDomFilterDepth">Depth</label>:
<input name="crawlingDomFilterDepth" id="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#" />
</td>
<td>
This option will automatically create a domain-filter which limits the crawl on domains the crawler
will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while
restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth
for this example would be 1.<br />
The default value 0 gives no restrictions.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Maximum Pages per Domain:</td>
<td>

@@ -42,13 +42,18 @@
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</td>
<td>
<span id="robotsOK"></span>
<img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td></tr><tr>
<span id="robotsOK"></span><img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Link-List of URL</td>
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL</td>
<td><input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/></td>
</tr></table><br/>
</tr>
</table><br/>
</dd>
<input type="hidden" name="crawlingDepth" id="crawlingDepth" value="99">
<dt><label>Scheduler</label></dt>

@@ -60,10 +60,6 @@ import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyNewsPool;
public class Crawler_p {
public static final String CRAWLING_MODE_URL = "url";
public static final String CRAWLING_MODE_FILE = "file";
public static final String CRAWLING_MODE_SITEMAP = "sitemap";
// this servlet does NOT create the Crawler servlet page content!
// this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
@@ -102,372 +98,405 @@ public class Crawler_p {
}
prop.put("info", "0");
if (post != null) {
// a crawl start
if (post.containsKey("continue")) {
// continue queue
final String queue = post.get("continue", "");
if (queue.equals("localcrawler")) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
if (post != null && post.containsKey("continue")) {
// continue queue
final String queue = post.get("continue", "");
if (queue.equals("localcrawler")) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) {
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
}
if (post.containsKey("pause")) {
// pause queue
final String queue = post.get("pause", "");
if (queue.equals("localcrawler")) {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
if (post != null && post.containsKey("pause")) {
// pause queue
final String queue = post.get("pause", "");
if (queue.equals("localcrawler")) {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
if (post.containsKey("crawlingstart")) {
// init crawl
if (sb.peers == null) {
prop.put("info", "3");
} else {
String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://");
if (pos == -1) crawlingStart = "http://" + crawlingStart;
}
if (post != null && post.containsKey("crawlingstart")) {
// init crawl
if (sb.peers == null) {
prop.put("info", "3");
} else {
String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://");
if (pos == -1) crawlingStart = "http://" + crawlingStart;
// normalizing URL
DigestURI crawlingStartURL = null;
try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {}
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
// set new properties
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
// set the crawling filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
if (recrawl.equals("scheduler") && repeat_time > 0) {
// set crawlingIfOlder attributes that are appropriate for scheduled crawling
crawlingIfOlderCheck = true;
crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
crawlingIfOlderUnit = "hour";
} else if (recrawl.equals("reload")) {
repeat_time = -1;
crawlingIfOlderCheck = true;
} else if (recrawl.equals("nodoubles")) {
repeat_time = -1;
crawlingIfOlderCheck = false;
}
long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
// normalize URL
DigestURI crawlingStartURL = null;
try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {}
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
// set new properties
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
// set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
if (recrawl.equals("scheduler") && repeat_time > 0) {
// set crawlingIfOlder attributes that are appropriate for scheduled crawling
crawlingIfOlderCheck = true;
crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
crawlingIfOlderUnit = "hour";
} else if (recrawl.equals("reload")) {
repeat_time = -1;
crawlingIfOlderCheck = true;
} else if (recrawl.equals("nodoubles")) {
repeat_time = -1;
crawlingIfOlderCheck = false;
}
long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
// store this call as api call
if (repeat_time > 0) {
// store as scheduled api call
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
}
final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));
final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
final boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
final boolean indexText = post.get("indexText", "off").equals("on");
env.setConfig("indexText", (indexText) ? "true" : "false");
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final String cachePolicyString = post.get("cachePolicy", "iffresh");
CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;
final boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
final boolean xdstopw = post.get("xdstopw", "off").equals("on");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
final boolean xpstopw = post.get("xpstopw", "off").equals("on");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
// store this call as api call
if (repeat_time > 0) {
// store as scheduled api call
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
}
final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
final boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
final boolean indexText = post.get("indexText", "off").equals("on");
env.setConfig("indexText", (indexText) ? "true" : "false");
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final String cachePolicyString = post.get("cachePolicy", "iffresh");
CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;
final boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
final boolean xdstopw = post.get("xdstopw", "off").equals("on");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
final boolean xpstopw = post.get("xpstopw", "off").equals("on");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
final String crawlingMode = post.get("crawlingMode","url");
if (crawlingMode.equals("url")) {
final String crawlingMode = post.get("crawlingMode","url");
if (crawlingMode.equals(CRAWLING_MODE_URL)) {
// check if pattern matches
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if pattern matches
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
// stack request
// first delete old entry, if exists
final DigestURI url = new DigestURI(crawlingStart, null);
final byte[] urlhash = url.hash();
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(),
0,
0,
0
));
if (reasonString == null) {
// create a bookmark from crawl start url
Set<String> tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
tags.add("crawlStart");
if (post.get("createBookmark","off").equals("on")) {
bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if(bookmark != null){
bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", post.get("intention", "").replace(',', '/'));
sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
}
} else {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", reasonString);
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
crawlingStartURL,
null,
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.peers.mySeed().hash.getBytes(),
new Date(),
1,
reasonString);
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
} else if (crawlingMode.equals("file")) {
if (post.containsKey("crawlingFile")) {
final String fileName = post.get("crawlingFile");
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
// stack request
// first delete old entry, if exists
final DigestURI url = new DigestURI(crawlingStart, null);
final byte[] urlhash = url.hash();
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
final File file = new File(fileName);
final String fileString = post.get("crawlingFile$file");
final ContentScraper scraper = new ContentScraper(new DigestURI(file));
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(fileString, writer);
writer.close();
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(),
0,
0,
0
));
if (reasonString == null) {
// create a bookmark from crawl start url
Set<String> tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
tags.add("crawlStart");
if (post.get("createBookmark","off").equals("on")) {
bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if(bookmark != null){
bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", post.get("intention", "").replace(',', '/'));
sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
}
} else {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", reasonString);
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
crawlingStartURL,
null,
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.peers.mySeed().hash.getBytes(),
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
DigestURI nexturl;
while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
new Date(),
1,
reasonString);
profile.handle(),
0,
0,
0
));
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.put("info", "7");//Error with file
prop.putHTML("info_crawlingStart", fileName);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
} else if (crawlingMode.equals("sitemap")) {
String sitemapURLStr = post.get("sitemapURL","");
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
importer.start();
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
} else if (crawlingMode.equals("sitelist")) {
try {
final DigestURI sitelistURL = new DigestURI(crawlingStart, null);
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
String title = scraper.getTitle();
// String description = scraper.getDescription();
} else if (crawlingMode.equals(CRAWLING_MODE_FILE)) {
if (post.containsKey("crawlingFile")) {
// getting the name of the uploaded file
final String fileName = post.get("crawlingFile");
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
// loading the file content
final File file = new File(fileName);
// getting the content of the bookmark file
final String fileString = post.get("crawlingFile$file");
// parsing the bookmark file and fetching the headline and contained links
final ContentScraper scraper = new ContentScraper(new DigestURI(file));
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null,null,scraper,null,false);
FileUtils.copy(fileString, writer);
writer.close();
//String headline = scraper.getHeadline();
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
// creating a crawler profile
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomFilterDepth,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
// pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
// loop through the contained links
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
DigestURI nexturl;
while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
// enqueuing the url for crawling
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
new Date(),
profile.handle(),
0,
0,
0
));
}
} catch (final PatternSyntaxException e) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "7");//Error with file
prop.putHTML("info_crawlingStart", fileName);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
// get links and generate filter
StringBuilder filter = new StringBuilder();
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
for (MultiProtocolURI uri: hyperlinks.keySet()) {
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
}
} else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) {
String sitemapURLStr = null;
try {
// getting the sitemap URL
sitemapURLStr = post.get("sitemapURL","");
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
// create a new profile
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
// create a new sitemap importer
final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe);
importer.start();
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";
// put links onto crawl queue
final CrawlProfile profile = new CrawlProfile(
title == null || title.length() == 0 ? sitelistURL.getHost() : title,
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
DigestURI nexturl;
while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
// remove the url from the database to be prepared to crawl them again
final byte[] urlhash = nexturl.hash();
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
new Date(),
profile.handle(),
0,
0,
0
));
}
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
}
}
if (post.containsKey("crawlingPerformance")) {
setPerformance(sb, post);
}
}
if (post != null && post.containsKey("crawlingPerformance")) {
setPerformance(sb, post);
}
// performance settings

@@ -152,7 +152,6 @@ public class QuickCrawlLink_p {
crawlingMustNotMatch,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction
crawlDynamic,
indexText,

@@ -81,6 +81,20 @@ public class getpageinfo_p {
// put language
Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
StringBuilder links = new StringBuilder();
StringBuilder filter = new StringBuilder();
count = 0;
for (MultiProtocolURI uri: scraper.getAnchors().keySet()) {
links.append(';').append(uri.toNormalform(true, false));
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
prop.putXML("links_" + count + "_link", uri.toNormalform(true, false));
count++;
}
prop.put("links", count);
prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
}
if(actions.indexOf("robots")>=0){

@@ -6,9 +6,16 @@
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon>
<sitelist>#[sitelist]#</sitelist>
<filter>#[filter]#</filter>
<tags>
#{tags}#
<tag name="#[tag]#" />
#{/tags}#
</tags>
<links>
#{links}#
<link name="#[link]#" />
#{/links}#
</links>
</pageinfo>

@@ -3,12 +3,12 @@ var AJAX_ON="/env/grafics/ajax.gif";
var timeout="";
function handleResponse(){
if(http.readyState == 4){
if (http.readyState == 4){
var response = http.responseXML;
// getting the document title
// get the document title
doctitle="";
if(response.getElementsByTagName("title")[0].firstChild!=null){
if (response.getElementsByTagName("title")[0].firstChild!=null){
doctitle=response.getElementsByTagName("title")[0].firstChild.nodeValue;
}
// document.getElementById("title").innerHTML=doctitle;
@@ -23,43 +23,51 @@ function handleResponse(){
if(robotsOKspan.firstChild){
robotsOKspan.removeChild(robotsOKspan.firstChild);
}
if(docrobotsOK==1){
if (docrobotsOK==1){
img=document.createElement("img");
img.setAttribute("src", "/env/grafics/ok.png");
img.setAttribute("width", "32px");
img.setAttribute("height", "32px");
robotsOKspan.appendChild(img);
}else if(docrobotsOK==0){
} else if(docrobotsOK==0){
img=document.createElement("img");
img.setAttribute("src", "/env/grafics/bad.png");
img.setAttribute("width", "32px");
img.setAttribute("height", "32px");
robotsOKspan.appendChild(img);
robotsOKspan.appendChild(img);
}else{
} else {
robotsOKspan.appendChild(document.createTextNode(""));
document.getElementById("robotsOK").innerHTML="";
}
// getting the sitemap URL contained in the robots.txt
// get the sitemap URL contained in the robots.txt
if (document.getElementsByName("sitemapURL").length > 0) {
sitemap="";
if(response.getElementsByTagName("sitemap")[0].firstChild!=null){
if (response.getElementsByTagName("sitemap")[0].firstChild!=null){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false;
}
sitelist="";
if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
}
document.getElementById("sitelistURLs").innerHTML = sitelist;
document.getElementById("sitelist").disabled=false;
// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
}
}
function changed(){
function changed() {
window.clearTimeout(timeout);
timeout=window.setTimeout("loadInfos()", 1500);
}
function loadInfos(){
function loadInfos() {
// displaying ajax image
document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON);

@@ -48,7 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
public static final String INDEX_TEXT = "indexText";
@@ -70,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domFilterDepth, final int domMaxPages,
final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
@@ -87,7 +86,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_FILTER_DEPTH, domFilterDepth);
put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
put(INDEX_TEXT, indexText);
@@ -186,21 +184,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0L;
}
}
public int domFilterDepth() {
// if the depth is equal or less to this depth,
// then the current url feeds with its domain the crawl filter
// if this is -1, all domains are feeded
final String r = get(DOM_FILTER_DEPTH);
if (r == null) return Integer.MAX_VALUE;
try {
final int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (final NumberFormatException e) {
Log.logException(e);
return Integer.MAX_VALUE;
}
}
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
@@ -270,16 +253,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
dp.inc();
}
}
public boolean grantedDomAppearance(final String domain) {
final int max = domFilterDepth();
if (max == Integer.MAX_VALUE) return true;
final DomProfile dp = doms.get(domain);
if (dp == null) {
return 0 < max;
}
return dp.depth <= max;
}
public boolean grantedDomCount(final String domain) {
final int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true;
@@ -292,10 +265,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public int domSize() {
return doms.size();
}
public boolean domExists(final String domain) {
if (domFilterDepth() == Integer.MAX_VALUE) return true;
return doms.containsKey(domain);
}
public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();

@@ -196,7 +196,7 @@ public final class CrawlStacker {
final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
// add domain to profile domain list
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
if (profile.domMaxPages() != Integer.MAX_VALUE) {
profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
}
@@ -296,12 +296,6 @@
return "post url not allowed";
}
// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains.");
return "url does not match domain filter";
}
// deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed.");

@@ -164,9 +164,10 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
this.defaultProxyProfile = new CrawlProfile(
"proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
@@ -177,38 +178,38 @@ public final class CrawlSwitchboard {
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
-1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
}
}
