@@ -368,8 +368,15 @@ public class Crawler_p {
                 prop.putHTML("info_error", e.getMessage());
             }
 
+            boolean hasCrawlstartDataOK = true;
+            // check crawlurl was given in sitecrawl
+            if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;
+
             // prepare a new crawling profile
-            final CrawlProfile profile = new CrawlProfile(
+            final CrawlProfile profile;
+            byte[] handle;
+            if (hasCrawlstartDataOK) {
+                profile = new CrawlProfile(
                     crawlName,
                     newcrawlingMustMatch,
                     newcrawlingMustNotMatch,
@@ -393,12 +400,14 @@ public class Crawler_p {
                     cachePolicy,
                     collection,
                     agentName);
-            byte[] handle = ASCII.getBytes(profile.handle());
+                handle = ASCII.getBytes(profile.handle());
 
             // before we fire up a new crawl, we make sure that another crawl with the same name is not running
             sb.crawler.removeActive(handle);
             sb.crawler.removePassive(handle);
-            try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (final SpaceExceededException e1) {}
+                try {
+                    sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);
+                } catch (final SpaceExceededException e1) { }
 
             // delete all error urls for that domain
             Set<String> hosthashes = new HashSet<String>();
@@ -408,6 +417,10 @@ public class Crawler_p {
             }
             sb.crawlQueues.errorURL.removeHosts(hosthashes);
             sb.index.fulltext().commit(true);
+            } else {
+                profile = null;
+                handle = null;
+            }
 
             // start the crawl
             if ("url".equals(crawlingMode)) {
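
The pattern this diff introduces is worth spelling out: the crawl-start input is validated first, and the profile is only constructed when the check passes. Because `profile` is declared as a blank `final` local, Java's definite-assignment rules force every branch to assign it exactly once, and the `else` branch assigns `null` so the downstream start/error-reporting code stays on a single path instead of returning early. Below is a minimal, self-contained sketch of that pattern, assuming hypothetical stand-ins (`GuardedStart`, `Profile`, the literal values); it is an illustration of the control flow, not YaCy's actual API.

    import java.nio.charset.StandardCharsets;
    import java.util.List;

    public final class GuardedStart {

        // Hypothetical stand-in for CrawlProfile: only constructed when the input is valid.
        static final class Profile {
            final String name;
            Profile(String name) { this.name = name; }
            String handle() { return Integer.toHexString(name.hashCode()); }
        }

        public static void main(String[] args) {
            String crawlingMode = "url";
            List<String> rootURLs = List.of("http://example.org/");

            // 1. Validate first, like hasCrawlstartDataOK in the patch above.
            boolean hasCrawlstartDataOK = true;
            if ("url".equals(crawlingMode) && rootURLs.isEmpty()) hasCrawlstartDataOK = false;

            // 2. Blank final local: the compiler requires each branch below to
            //    assign 'profile' exactly once before it is read.
            final Profile profile;
            byte[] handle;
            if (hasCrawlstartDataOK) {
                profile = new Profile("example-crawl");
                handle = profile.handle().getBytes(StandardCharsets.US_ASCII);
            } else {
                profile = null;  // downstream code treats null as "report an error"
                handle = null;
            }

            // 3. One shared downstream path instead of an early return.
            if (profile != null) {
                System.out.println("start crawl, handle=" + new String(handle, StandardCharsets.US_ASCII));
            } else {
                System.out.println("no crawl start url given");
            }
        }
    }

Keeping the invalid case on the same path as the valid one mirrors the diff's `} else { profile = null; handle = null; }` branch: the later `if ("url".equals(crawlingMode))` block can decide how to report the failure without duplicating cleanup logic.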