@@ -171,6 +171,7 @@ public class Crawler_p {
}
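// old index entries are deleted either when an age limit applies (deleteage with a
// valid date) or when a restricted crawl explicitly sets the "deleteold" flag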
final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold"));
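// read the sitemap URL once up front so later code (e.g. the crawl-name fallback below) can use it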
final String sitemapURLStr = post.get("sitemapURL","");
String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
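// a crawl may have several start URLs: split on line breaks if any are present,
// otherwise on a literal '|' (Pattern.quote keeps '|' from acting as regex alternation)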
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
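// collect the parsed start URLs; the set silently drops duplicates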
Set<DigestURL> rootURLs = new HashSet<DigestURL>();
@@ -199,7 +200,7 @@ public class Crawler_p {
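// truncate an over-long auto-generated crawl name; p is presumably the index of
// the last ',' separator, computed just above this hunk (not shown)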
if (p >= 8) crawlName = crawlName.substring(0, p);
}
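// strip a trailing ',' (presumably left over from joining the start hosts into the name)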
if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
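// fallback: a pure sitemap crawl has no start URLs to derive a name from,
// so name it after the sitemap URL instead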
if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
// set the crawl filter
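// missing form parameters fall back to the match-all pattern, i.e. no IP restriction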
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
@@ -442,7 +443,6 @@ public class Crawler_p {
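// resume the local crawl job only if at least one start URL was successfully queued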
if (successurls.size() > 0) sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
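// sitemap mode: start the crawl from the URLs listed in a sitemap document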
} else if ("sitemap".equals(crawlingMode)) {
final String sitemapURLStr = post.get("sitemapURL","");
try {
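// constructing the DigestURL validates the sitemap URL; a malformed URL throws here,
// presumably handled by a catch clause below (not shown in this hunk)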
final DigestURL sitemapURL = new DigestURL(sitemapURLStr);
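// register the crawl profile as active under its handle so the crawler can pick it up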
sb.crawler.putActive(handle, profile);