diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index b495ee91d..a82a2c390 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -21,7 +21,7 @@ #%env/templates/submenuCrawlMonitor.template%#

Crawler

-
+
Queues @@ -75,7 +75,7 @@
 #[queuemessage]#
-
+
Index Size @@ -107,7 +107,7 @@
-
+
Progress
@@ -158,8 +158,17 @@
-
+

#(info)# diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 5f9a238b6..4bb106d23 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -368,47 +368,60 @@ public class Crawler_p { prop.putHTML("info_error", e.getMessage()); } + boolean hasCrawlstartDataOK = true; + // check crawlurl was given in sitecrawl + if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false; + // prepare a new crawling profile - final CrawlProfile profile = new CrawlProfile( - crawlName, - newcrawlingMustMatch, - newcrawlingMustNotMatch, - ipMustMatch, - ipMustNotMatch, - countryMustMatch, - crawlerNoDepthLimitMatch, - indexUrlMustMatch, - indexUrlMustNotMatch, - indexContentMustMatch, - indexContentMustNotMatch, - newcrawlingdepth, - directDocByURL, - crawlingIfOlder, - crawlingDomMaxPages, - crawlingQ, followFrames, obeyHtmlRobotsNoindex, - indexText, - indexMedia, - storeHTCache, - crawlOrder, - cachePolicy, - collection, - agentName); - byte[] handle = ASCII.getBytes(profile.handle()); - - // before we fire up a new crawl, we make sure that another crawl with the same name is not running - sb.crawler.removeActive(handle); - sb.crawler.removePassive(handle); - try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (final SpaceExceededException e1) {} - - // delete all error urls for that domain - Set hosthashes = new HashSet(); - for (DigestURL u: rootURLs) { - sb.index.fulltext().remove(u.hash()); - hosthashes.add(u.hosthash()); + final CrawlProfile profile; + byte[] handle; + if (hasCrawlstartDataOK) { + profile = new CrawlProfile( + crawlName, + newcrawlingMustMatch, + newcrawlingMustNotMatch, + ipMustMatch, + ipMustNotMatch, + countryMustMatch, + crawlerNoDepthLimitMatch, + indexUrlMustMatch, + indexUrlMustNotMatch, + indexContentMustMatch, + indexContentMustNotMatch, + newcrawlingdepth, + directDocByURL, + crawlingIfOlder, + crawlingDomMaxPages, + crawlingQ, followFrames, obeyHtmlRobotsNoindex, + indexText, + indexMedia, + storeHTCache, + crawlOrder, + cachePolicy, + collection, + agentName); + handle = ASCII.getBytes(profile.handle()); + + // before we fire up a new crawl, we make sure that another crawl with the same name is not running + sb.crawler.removeActive(handle); + sb.crawler.removePassive(handle); + try { + sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000); + } catch (final SpaceExceededException e1) { } + + // delete all error urls for that domain + Set hosthashes = new HashSet(); + for (DigestURL u : rootURLs) { + sb.index.fulltext().remove(u.hash()); + hosthashes.add(u.hosthash()); + } + sb.crawlQueues.errorURL.removeHosts(hosthashes); + sb.index.fulltext().commit(true); + } else { + profile = null; + handle = null; } - sb.crawlQueues.errorURL.removeHosts(hosthashes); - sb.index.fulltext().commit(true); - + // start the crawl if ("url".equals(crawlingMode)) { if (rootURLs.size() == 0) { diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index d3ac606b3..5b1c7b019 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -134,7 +134,7 @@ To see a list of all APIs, please visit the
if set, uses the predicate
#[objectspacepredicate]# for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)# -

This produces the following triples in the triplestore if a term or synonym matches in a document:

+

This produces the following triples in the triplestore if a term or synonym matches in a document:

Triple #1
#[triple1]#
Triple #2
#[triple2]#
diff --git a/pom.xml b/pom.xml index 455dde921..b46297b49 100644 --- a/pom.xml +++ b/pom.xml @@ -325,7 +325,7 @@ org.apache.pdfbox fontbox - 1.8.3 + 1.8.4 org.apache.geronimo.specs @@ -546,7 +546,7 @@ xerces xercesImpl - 2.7.1 + 2.11.0