prevent exception on Site Crawl if no start url is given

pull/1/head
reger 11 years ago
parent a373fb717d
commit d052bbdfe1

@@ -368,47 +368,60 @@ public class Crawler_p {
                 prop.putHTML("info_error", e.getMessage());
             }
+            boolean hasCrawlstartDataOK = true;
+            // check crawlurl was given in sitecrawl
+            if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;

             // prepare a new crawling profile
-            final CrawlProfile profile = new CrawlProfile(
-                    crawlName,
-                    newcrawlingMustMatch,
-                    newcrawlingMustNotMatch,
-                    ipMustMatch,
-                    ipMustNotMatch,
-                    countryMustMatch,
-                    crawlerNoDepthLimitMatch,
-                    indexUrlMustMatch,
-                    indexUrlMustNotMatch,
-                    indexContentMustMatch,
-                    indexContentMustNotMatch,
-                    newcrawlingdepth,
-                    directDocByURL,
-                    crawlingIfOlder,
-                    crawlingDomMaxPages,
-                    crawlingQ, followFrames, obeyHtmlRobotsNoindex,
-                    indexText,
-                    indexMedia,
-                    storeHTCache,
-                    crawlOrder,
-                    cachePolicy,
-                    collection,
-                    agentName);
-            byte[] handle = ASCII.getBytes(profile.handle());
-
-            // before we fire up a new crawl, we make sure that another crawl with the same name is not running
-            sb.crawler.removeActive(handle);
-            sb.crawler.removePassive(handle);
-            try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (final SpaceExceededException e1) {}
-
-            // delete all error urls for that domain
-            Set<String> hosthashes = new HashSet<String>();
-            for (DigestURL u: rootURLs) {
-                sb.index.fulltext().remove(u.hash());
-                hosthashes.add(u.hosthash());
-            }
-            sb.crawlQueues.errorURL.removeHosts(hosthashes);
-            sb.index.fulltext().commit(true);
+            final CrawlProfile profile;
+            byte[] handle;
+            if (hasCrawlstartDataOK) {
+                profile = new CrawlProfile(
+                        crawlName,
+                        newcrawlingMustMatch,
+                        newcrawlingMustNotMatch,
+                        ipMustMatch,
+                        ipMustNotMatch,
+                        countryMustMatch,
+                        crawlerNoDepthLimitMatch,
+                        indexUrlMustMatch,
+                        indexUrlMustNotMatch,
+                        indexContentMustMatch,
+                        indexContentMustNotMatch,
+                        newcrawlingdepth,
+                        directDocByURL,
+                        crawlingIfOlder,
+                        crawlingDomMaxPages,
+                        crawlingQ, followFrames, obeyHtmlRobotsNoindex,
+                        indexText,
+                        indexMedia,
+                        storeHTCache,
+                        crawlOrder,
+                        cachePolicy,
+                        collection,
+                        agentName);
+                handle = ASCII.getBytes(profile.handle());
+
+                // before we fire up a new crawl, we make sure that another crawl with the same name is not running
+                sb.crawler.removeActive(handle);
+                sb.crawler.removePassive(handle);
+                try {
+                    sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);
+                } catch (final SpaceExceededException e1) { }
+
+                // delete all error urls for that domain
+                Set<String> hosthashes = new HashSet<String>();
+                for (DigestURL u : rootURLs) {
+                    sb.index.fulltext().remove(u.hash());
+                    hosthashes.add(u.hosthash());
+                }
+                sb.crawlQueues.errorURL.removeHosts(hosthashes);
+                sb.index.fulltext().commit(true);
+            } else {
+                profile = null;
+                handle = null;
+            }

             // start the crawl
             if ("url".equals(crawlingMode)) {
                 if (rootURLs.size() == 0) {
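
The change is a plain guard: the crawl profile creation and the queue/index cleanup now only run when a start URL is actually present (hasCrawlstartDataOK), otherwise profile and handle stay null and the code falls through to the existing "url" mode branch, which presumably reports the missing start URL instead of throwing while building the CrawlProfile. As a rough stand-alone sketch of that guard pattern (not YaCy code; the class CrawlStartGuard, the method startCrawl and the messages are hypothetical stand-ins, only the flag logic mirrors the patch):

import java.util.Collections;
import java.util.List;

// Minimal sketch of the guard introduced by this commit; all names are illustrative.
public class CrawlStartGuard {

    static String startCrawl(String crawlingMode, List<String> rootURLs) {
        // check crawlurl was given in sitecrawl (corresponds to hasCrawlstartDataOK in the patch)
        boolean hasCrawlstartDataOK = true;
        if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;

        if (!hasCrawlstartDataOK) {
            // skip profile creation and queue cleanup; report the problem instead of throwing
            return "No start URL given; crawl not started.";
        }
        // ... create the crawl profile and start the crawl here ...
        return "Crawl started with " + rootURLs.size() + " root URL(s).";
    }

    public static void main(String[] args) {
        System.out.println(startCrawl("url", Collections.emptyList()));                          // error path, no exception
        System.out.println(startCrawl("url", Collections.singletonList("http://example.org/"))); // normal path
    }
}

Running main prints the error message for the empty list and the success message for the single start URL, which is the behavior the patch aims for in Crawler_p.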
