Better removal of stored URLs when doing a crawl start

pull/1/head
orbiter 11 years ago
parent 2f63bd0261
commit c6f0bd05f8

@ -356,6 +356,16 @@ public class Crawler_p {
crawlingMode = "url";
if ((fullDomain || subPath) && newcrawlingdepth > 0) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // to prevent that there is a restriction on the original urls
}
// delete all error urls for that domain
// and all urls for that host from the crawl queue
Set<String> hosthashes = new HashSet<String>();
for (DigestURL u : rootURLs) {
sb.index.fulltext().remove(u.hash());
hosthashes.add(u.hosthash());
}
sb.crawlQueues.removeHosts(hosthashes);
sb.index.fulltext().commit(true);
// compute mustmatch filter according to rootURLs
if ((fullDomain || subPath) && newcrawlingdepth > 0) {
@ -363,23 +373,17 @@ public class Crawler_p {
if (fullDomain) {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
Set<String> hosthashes = new HashSet<String>();
for (DigestURL u: rootURLs) hosthashes.add(u.hosthash());
sb.index.fulltext().deleteStaleDomainHashes(hosthashes, deleteageDate);
sb.crawlQueues.removeHosts(hosthashes);
}
} else if (subPath) {
siteFilter = CrawlProfile.subpathFilter(rootURLs);
if (deleteold) {
Set<String> hosthashes = new HashSet<String>();
for (DigestURL u: rootURLs) {
hosthashes.add(u.hosthash());
String basepath = u.toNormalform(true);
if (!basepath.endsWith("/")) {int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
int count = sb.index.fulltext().remove(basepath, deleteageDate);
if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
sb.crawlQueues.removeHosts(hosthashes);
}
}
if (CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) {
@ -449,15 +453,6 @@ public class Crawler_p {
try {
sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);
} catch (final SpaceExceededException e1) { }
// delete all error urls for that domain
Set<String> hosthashes = new HashSet<String>();
for (DigestURL u : rootURLs) {
sb.index.fulltext().remove(u.hash());
hosthashes.add(u.hosthash());
}
sb.crawlQueues.errorURL.removeHosts(hosthashes);
sb.index.fulltext().commit(true);
} else {
profile = null;
handle = null;

Loading…
Cancel
Save