@@ -25,6 +25,7 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.Writer;
 import java.net.MalformedURLException;
+import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -267,6 +268,7 @@ public class Crawler_p {
         Set<DigestURL> rootURLs = new HashSet<DigestURL>();
         String crawlName = "";
         if (crawlingFile == null) for (String crawlingStart: rootURLs0) {
+            StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large
             if (crawlingStart == null || crawlingStart.length() == 0) continue;
             // add the prefix http:// if necessary
             int pos = crawlingStart.indexOf("://",0);
@@ -276,12 +278,12 @@ public class Crawler_p {
             try {
                 DigestURL crawlingStartURL = new DigestURL(crawlingStart);
                 rootURLs.add(crawlingStartURL);
-                crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ',';
+                crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(',');
                 if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
-
             } catch (final MalformedURLException e) {
                 ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage());
             }
+            crawlName = crawlNameBuilder.toString();
         } else {
             crawlName = crawlingFile.getName();
         }
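
The two hunks above replace per-iteration String concatenation ("crawlName +=") with a single StringBuilder that is converted to a String once. Concatenation in a loop copies the whole accumulated name on every pass, which gets expensive for large start-URL lists (hence the in-code remark about large crawl queues). A minimal standalone sketch of the same pattern; class and method names here are illustrative, not YaCy API:

    import java.util.Arrays;
    import java.util.List;

    public class CrawlNameDemo {
        // Joins non-empty host names with ',' in one growable buffer instead of
        // rebuilding an immutable String on every iteration.
        static String joinHosts(List<String> hosts) {
            StringBuilder crawlNameBuilder = new StringBuilder();
            for (String host : hosts) {
                if (host == null || host.isEmpty()) continue;
                crawlNameBuilder.append(host).append(',');
            }
            return crawlNameBuilder.toString();
        }

        public static void main(String[] args) {
            // Prints "example.org,example.net," (the trailing comma is kept,
            // mirroring the servlet's behavior).
            System.out.println(joinHosts(Arrays.asList("example.org", "example.net")));
        }
    }
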
@@ -415,13 +417,15 @@ public class Crawler_p {
 
             // delete all error urls for that domain
             // and all urls for that host from the crawl queue
+            List<String> deleteIDs = new ArrayList<>();
             Set<String> hosthashes = new HashSet<String>();
             boolean anysmbftporpdf = false;
             for (DigestURL u : rootURLs) {
-                sb.index.fulltext().remove(u.hash());
+                deleteIDs.add(new String(u.hash()));
                 hosthashes.add(u.hosthash());
                 if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
             }
+            sb.index.fulltext().remove(deleteIDs);
             sb.crawlQueues.removeHosts(hosthashes);
             sb.index.fulltext().commit(true);
 
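
The hunk above batches the deletions: rather than one fulltext remove per root URL, the document ids are collected into deleteIDs and removed in a single call, with one commit at the end; host hashes are likewise gathered and handed to the crawl queue in one pass. Note the terse protocol test: "smb.ftp".indexOf(u.getProtocol()) >= 0 is true exactly when the protocol string occurs inside "smb.ftp", i.e. for "smb" and "ftp" (compact, but it would also accept odd substrings like "b.f"). A sketch of the collect-then-delete pattern; Fulltext, CrawlQueues and Url are simplified stand-ins here, not the real YaCy types:

    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class BatchDeleteSketch {
        interface Fulltext { void remove(Collection<String> ids); }
        interface CrawlQueues { void removeHosts(Set<String> hosthashes); }
        interface Url { String idHash(); String hostHash(); }

        static void deleteRoots(Fulltext fulltext, CrawlQueues queues, List<Url> roots) {
            List<String> deleteIDs = new ArrayList<>();
            Set<String> hosthashes = new HashSet<>();
            for (Url u : roots) {
                deleteIDs.add(u.idHash());   // collect instead of deleting one by one
                hosthashes.add(u.hostHash());
            }
            fulltext.remove(deleteIDs);      // one batched index deletion
            queues.removeHosts(hosthashes);  // one pass over the crawl queue
        }
    }
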
@@ -458,8 +462,10 @@ public class Crawler_p {
             // check if the crawl filter works correctly
             try {
                 Pattern mmp = Pattern.compile(newcrawlingMustMatch);
+                int maxcheck = 100;
                 for (DigestURL u: rootURLs) {
                     assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true);
+                    if (maxcheck-- <= 0) break;
                 }
             } catch (final PatternSyntaxException e) {
                 prop.put("info", "4"); // crawlfilter does not match url
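
This hunk caps the cost of the filter sanity check: the assert now runs for at most 100 root URLs instead of all of them, while a PatternSyntaxException in the must-match expression still routes to the info=4 error page. The shape of that check, as a self-contained sketch with illustrative names:

    import java.util.List;
    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    public class FilterCheckSketch {
        // Returns false when the must-match expression does not compile;
        // otherwise spot-checks at most `limit` sample URLs against it.
        static boolean filterLooksSane(String mustMatch, List<String> samples, int limit) {
            try {
                Pattern mmp = Pattern.compile(mustMatch);
                int maxcheck = limit;
                for (String s : samples) {
                    // assertions only fire when the JVM runs with -ea,
                    // mirroring the servlet's debug-time check
                    assert mmp.matcher(s).matches() : "pattern " + mmp + " does not match url " + s;
                    if (maxcheck-- <= 0) break; // bound the work for huge start lists
                }
                return true;
            } catch (final PatternSyntaxException e) {
                return false;
            }
        }
    }
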
@@ -590,9 +596,6 @@ public class Crawler_p {
                 hasCrawlstartDataOK = false;
                 prop.put("info", "9");
             }
         }
 
         // prepare a new crawling profile
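
The crawl profile prepared below is essentially a string-keyed attribute map: the next hunk writes CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE into it, and profile.handle() later becomes the byte[] crawl handle. A toy version of that shape, purely to illustrate the pattern and not the real CrawlProfile API:

    import java.util.HashMap;
    import java.util.Map;
    import java.util.UUID;

    public class ProfileSketch {
        // Toy crawl profile: typed accessors over a string-keyed attribute map,
        // plus a unique handle identifying the profile.
        static class Profile {
            private final Map<String, String> attrs = new HashMap<>();
            private final String handle = UUID.randomUUID().toString(); // stand-in id

            void put(String key, boolean value) { attrs.put(key, Boolean.toString(value)); }
            boolean getBoolean(String key) { return Boolean.parseBoolean(attrs.get(key)); }
            String handle() { return handle; }
        }
    }
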
@@ -645,7 +648,6 @@ public class Crawler_p {
             profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
                     post.getBoolean("crawlerAlwaysCheckMediaType"));
 
             handle = ASCII.getBytes(profile.handle());
 
             // before we fire up a new crawl, we make sure that another crawl with the same name is not running
@@ -659,10 +661,8 @@ public class Crawler_p {
                 handle = null;
             }
 
             // start the crawl
             if (hasCrawlstartDataOK) {
                 final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false);
 
                 if ("url".equals(crawlingMode)) {
@@ -710,7 +710,6 @@ public class Crawler_p {
                     }
                     if (successurls.size() > 0) {
                         sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
 
                         prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
                     }
                 } else if ("sitemap".equals(crawlingMode)) {