prevent exception on Site Crawl if no start url is given

pull/1/head
reger 11 years ago
parent a373fb717d
commit d052bbdfe1

@@ -368,47 +368,60 @@ public class Crawler_p {
                 prop.putHTML("info_error", e.getMessage());
             }
+            boolean hasCrawlstartDataOK = true;
+            // check crawlurl was given in sitecrawl
+            if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;

             // prepare a new crawling profile
-            final CrawlProfile profile = new CrawlProfile(
-                    crawlName,
-                    newcrawlingMustMatch,
-                    newcrawlingMustNotMatch,
-                    ipMustMatch,
-                    ipMustNotMatch,
-                    countryMustMatch,
-                    crawlerNoDepthLimitMatch,
-                    indexUrlMustMatch,
-                    indexUrlMustNotMatch,
-                    indexContentMustMatch,
-                    indexContentMustNotMatch,
-                    newcrawlingdepth,
-                    directDocByURL,
-                    crawlingIfOlder,
-                    crawlingDomMaxPages,
-                    crawlingQ, followFrames, obeyHtmlRobotsNoindex,
-                    indexText,
-                    indexMedia,
-                    storeHTCache,
-                    crawlOrder,
-                    cachePolicy,
-                    collection,
-                    agentName);
-            byte[] handle = ASCII.getBytes(profile.handle());
-
-            // before we fire up a new crawl, we make sure that another crawl with the same name is not running
-            sb.crawler.removeActive(handle);
-            sb.crawler.removePassive(handle);
-            try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (final SpaceExceededException e1) {}
-
-            // delete all error urls for that domain
-            Set<String> hosthashes = new HashSet<String>();
-            for (DigestURL u: rootURLs) {
-                sb.index.fulltext().remove(u.hash());
-                hosthashes.add(u.hosthash());
-            }
-            sb.crawlQueues.errorURL.removeHosts(hosthashes);
-            sb.index.fulltext().commit(true);
+            final CrawlProfile profile;
+            byte[] handle;
+            if (hasCrawlstartDataOK) {
+                profile = new CrawlProfile(
+                        crawlName,
+                        newcrawlingMustMatch,
+                        newcrawlingMustNotMatch,
+                        ipMustMatch,
+                        ipMustNotMatch,
+                        countryMustMatch,
+                        crawlerNoDepthLimitMatch,
+                        indexUrlMustMatch,
+                        indexUrlMustNotMatch,
+                        indexContentMustMatch,
+                        indexContentMustNotMatch,
+                        newcrawlingdepth,
+                        directDocByURL,
+                        crawlingIfOlder,
+                        crawlingDomMaxPages,
+                        crawlingQ, followFrames, obeyHtmlRobotsNoindex,
+                        indexText,
+                        indexMedia,
+                        storeHTCache,
+                        crawlOrder,
+                        cachePolicy,
+                        collection,
+                        agentName);
+                handle = ASCII.getBytes(profile.handle());
+
+                // before we fire up a new crawl, we make sure that another crawl with the same name is not running
+                sb.crawler.removeActive(handle);
+                sb.crawler.removePassive(handle);
+                try {
+                    sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);
+                } catch (final SpaceExceededException e1) { }
+
+                // delete all error urls for that domain
+                Set<String> hosthashes = new HashSet<String>();
+                for (DigestURL u : rootURLs) {
+                    sb.index.fulltext().remove(u.hash());
+                    hosthashes.add(u.hosthash());
+                }
+                sb.crawlQueues.errorURL.removeHosts(hosthashes);
+                sb.index.fulltext().commit(true);
+            } else {
+                profile = null;
+                handle = null;
+            }

             // start the crawl
             if ("url".equals(crawlingMode)) {
                 if (rootURLs.size() == 0) {
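
The change is a plain guard: the crawl profile creation and the queue/index cleanup now only run when a start URL is actually present (hasCrawlstartDataOK), otherwise profile and handle stay null and the code falls through to the existing "url" mode branch, which presumably reports the missing start URL instead of throwing while building the CrawlProfile. As a rough stand-alone sketch of that guard pattern (not YaCy code; the class CrawlStartGuard, the method startCrawl and the messages are hypothetical stand-ins, only the flag logic mirrors the patch):

import java.util.Collections;
import java.util.List;

// Minimal sketch of the guard introduced by this commit; all names are illustrative.
public class CrawlStartGuard {

    static String startCrawl(String crawlingMode, List<String> rootURLs) {
        // check crawlurl was given in sitecrawl (corresponds to hasCrawlstartDataOK in the patch)
        boolean hasCrawlstartDataOK = true;
        if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;

        if (!hasCrawlstartDataOK) {
            // skip profile creation and queue cleanup; report the problem instead of throwing
            return "No start URL given; crawl not started.";
        }
        // ... create the crawl profile and start the crawl here ...
        return "Crawl started with " + rootURLs.size() + " root URL(s).";
    }

    public static void main(String[] args) {
        System.out.println(startCrawl("url", Collections.emptyList()));                          // error path, no exception
        System.out.println(startCrawl("url", Collections.singletonList("http://example.org/"))); // normal path
    }
}

Running main prints the error message for the empty list and the success message for the single start URL, which is the behavior the patch aims for in Crawler_p.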
