- bad regex computation for crawl start from file (limitation on domain did not work)
- servlet error when starting crawl from a large list of urls
pull/8/head
Michael Peter Christen 10 years ago
parent 1fec7fb3c1
commit 3c4c69adea

@@ -254,11 +254,12 @@ public class Crawler_p {
             } else {
                 crawlName = crawlingFile.getName();
             }
-            if (crawlName.length() > 256) {
+            if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
+            if (crawlName.length() > 64) {
+                crawlName = "crawl_for_" + rootURLs.size() + "_start_points_" + Integer.toHexString(crawlName.hashCode());
                 int p = crawlName.lastIndexOf(',');
                 if (p >= 8) crawlName = crawlName.substring(0, p);
             }
-            if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
             if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;

             // delete old robots entries
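The hunk above caps over-long crawl names built from a start-URL file. A minimal standalone sketch of the same shortening logic; only the 64-character limit and the renaming scheme are taken from the diff, the sample URLs and the comma-joining of the name are assumptions for illustration:

import java.util.Arrays;
import java.util.List;

public class CrawlNameSketch {
    public static void main(String[] args) {
        // assumed start points; in Crawler_p they come from the uploaded file
        List<String> rootURLs = Arrays.asList(
                "http://example.com/", "http://example.org/",
                "http://example.net/", "http://yacy.net/");
        String crawlName = String.join(",", rootURLs);
        if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
        if (crawlName.length() > 64) {
            // same renaming as the patch: number of start points plus a hash of the long name
            crawlName = "crawl_for_" + rootURLs.size() + "_start_points_"
                    + Integer.toHexString(crawlName.hashCode());
            int p = crawlName.lastIndexOf(',');
            if (p >= 8) crawlName = crawlName.substring(0, p);
        }
        System.out.println(crawlName); // e.g. crawl_for_4_start_points_<hex>
    }
}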
@@ -466,6 +467,40 @@ public class Crawler_p {

         int timezoneOffset = post.getInt("timezoneOffset", 0);

+        // in case that we crawl from a file, load that file and re-compute mustmatch pattern
+        List<AnchorURL> hyperlinks_from_file = null;
+        if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) {
+            final String crawlingFileContent = post.get("crawlingFile$file", "");
+            try {
+                // check if the crawl filter works correctly
+                final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
+                final Writer writer = new TransformerWriter(null, null, scraper, null, false);
+                if (crawlingFile != null && crawlingFile.exists()) {
+                    FileUtils.copy(new FileInputStream(crawlingFile), writer);
+                } else {
+                    FileUtils.copy(crawlingFileContent, writer);
+                }
+                writer.close();
+
+                // get links and generate filter
+                hyperlinks_from_file = scraper.getAnchors();
+                if (newcrawlingdepth > 0) {
+                    if (fullDomain) {
+                        newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
+                    } else if (subPath) {
+                        newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
+                    }
+                }
+            } catch (final Exception e) {
+                // mist
+                prop.put("info", "7"); // Error with file
+                prop.putHTML("info_crawlingStart", crawlingFileName);
+                prop.putHTML("info_error", e.getMessage());
+                ConcurrentLog.logException(e);
+            }
+            sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+        }
+
         // prepare a new crawling profile
         final CrawlProfile profile;
         byte[] handle;
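With hyperlinks_from_file scraped before the profile is created, the must-match pattern can be recomputed from the links found in the file. A small sketch of how such a recomputed filter behaves; the filter string below is only an example of the shape siteFilter produces for two assumed hosts, not output taken from YaCy:

import java.util.regex.Pattern;

public class MustMatchSketch {
    public static void main(String[] args) {
        // assumed filter of the form produced for hosts example.com and yacy.net
        String mustMatch = "(smb|ftp|https?)://(www.)?(\\Qexample.com\\E.*|\\Qyacy.net\\E.*)";
        Pattern p = Pattern.compile(mustMatch); // an invalid filter would raise PatternSyntaxException here
        System.out.println(p.matcher("https://www.example.com/docs/").matches()); // true
        System.out.println(p.matcher("ftp://yacy.net/download").matches());       // true
        System.out.println(p.matcher("https://elsewhere.org/").matches());        // false
    }
}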
@@ -578,32 +613,17 @@ public class Crawler_p {
                     ConcurrentLog.logException(e);
                 }

             } else if ("file".equals(crawlingMode)) {
-                if (post.containsKey("crawlingFile") && crawlingFile != null) {
-                    final String crawlingFileContent = post.get("crawlingFile$file", "");
+                if (post.containsKey("crawlingFile") && crawlingFile != null && hyperlinks_from_file != null) {
                     try {
-                        // check if the crawl filter works correctly
-                        Pattern.compile(newcrawlingMustMatch);
-                        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
-                        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-                        if (crawlingFile != null && crawlingFile.exists()) {
-                            FileUtils.copy(new FileInputStream(crawlingFile), writer);
-                        } else {
-                            FileUtils.copy(crawlingFileContent, writer);
-                        }
-                        writer.close();
-
-                        // get links and generate filter
-                        final List<AnchorURL> hyperlinks = scraper.getAnchors();
                         if (newcrawlingdepth > 0) {
                             if (fullDomain) {
-                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks);
+                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
                             } else if (subPath) {
-                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks);
+                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
                             }
                         }
                         sb.crawler.putActive(handle, profile);
-                        sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, profile.timezoneOffset());
+                        sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                     } catch (final PatternSyntaxException e) {
                         prop.put("info", "4"); // crawlfilter does not match url
                         prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);

@@ -691,11 +691,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     }

     public static String siteFilter(final Collection<? extends MultiProtocolURL> urls) {
-        LinkedHashSet<String> filters = new LinkedHashSet<String>(); // first collect in a set to eliminate doubles
-        for (final MultiProtocolURL url: urls) filters.add(mustMatchFilterFullDomain(url));
         final StringBuilder filter = new StringBuilder();
-        for (final String urlfilter: filters) filter.append('|').append(urlfilter);
-        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+        filter.append("(smb|ftp|https?)://(www.)?(");
+        for (final MultiProtocolURL url: urls) {
+            String host = url.getHost();
+            if (host == null) continue;
+            if (host.startsWith("www.")) host = host.substring(4);
+            filter.append(Pattern.quote(host.toLowerCase())).append(".*|");
+        }
+        filter.setCharAt(filter.length() - 1, ')');
+        return filter.toString();
     }

     public static String mustMatchFilterFullDomain(final MultiProtocolURL url) {
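The rewritten siteFilter no longer joins one full per-URL filter with '|'; it emits a single alternation of quoted host names behind one shared protocol prefix, which keeps the combined expression much shorter for large start lists. A standalone sketch of the same construction, with plain host strings standing in for MultiProtocolURL:

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

public class SiteFilterSketch {
    // same construction as the patched siteFilter, but over plain host strings
    static String siteFilter(List<String> hosts) {
        StringBuilder filter = new StringBuilder("(smb|ftp|https?)://(www.)?(");
        for (String host : hosts) {
            if (host == null) continue;
            if (host.startsWith("www.")) host = host.substring(4);
            filter.append(Pattern.quote(host.toLowerCase())).append(".*|");
        }
        filter.setCharAt(filter.length() - 1, ')'); // replace the trailing '|'
        return filter.toString();
    }

    public static void main(String[] args) {
        System.out.println(siteFilter(Arrays.asList("www.Example.com", "yacy.net")));
        // prints: (smb|ftp|https?)://(www.)?(\Qexample.com\E.*|\Qyacy.net\E.*)
    }
}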
@@ -721,7 +726,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (host.startsWith("www.")) host = host.substring(4);
         String protocol = url.getProtocol();
         if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
-        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(url.getPath()).append(".*").toString();
+        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase())).append(url.getPath()).append(".*").toString();
     }

     public boolean isPushCrawlProfile() {
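The one-character change in mustMatchFilterFullDomain lower-cases the quoted host, so a filter built from a mixed-case start URL still matches the normalized URLs seen during crawling. A sketch of the resulting filter string, with plain strings standing in for the MultiProtocolURL parts (the host, protocol, and path values are assumptions):

import java.util.regex.Pattern;

public class FullDomainFilterSketch {
    public static void main(String[] args) {
        // assumed values; in CrawlProfile they come from a MultiProtocolURL
        String host = "Example.COM";   // host with a leading "www." already stripped
        String protocol = "https?+";   // http/https collapsed as in the method
        String path = "/";
        String filter = new StringBuilder(host.length() + 20)
                .append(protocol).append("://(www.)?")
                .append(Pattern.quote(host.toLowerCase()))  // the fix: quote the lower-cased host
                .append(path).append(".*").toString();
        System.out.println(filter); // https?+://(www.)?\Qexample.com\E/.*
        System.out.println("https://www.example.com/index.html".matches(filter)); // true
    }
}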
