fix for wrong crawlName construction

pull/419/head
Michael Peter Christen 4 years ago
parent e81b770f79
commit 4377bd2b70

@ -267,21 +267,23 @@ public class Crawler_p {
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
Set<DigestURL> rootURLs = new HashSet<DigestURL>();
String crawlName = "";
if (crawlingFile == null) for (String crawlingStart: rootURLs0) {
if (crawlingFile == null) {
StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large
if (crawlingStart == null || crawlingStart.length() == 0) continue;
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://",0);
if (pos == -1) {
if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "http://" + crawlingStart;
}
try {
DigestURL crawlingStartURL = new DigestURL(crawlingStart);
rootURLs.add(crawlingStartURL);
crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(',');
if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
} catch (final MalformedURLException e) {
ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage());
for (String crawlingStart: rootURLs0) {
if (crawlingStart == null || crawlingStart.length() == 0) continue;
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://",0);
if (pos == -1) {
if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "https://" + crawlingStart; // we default to https instead of http because https URLs outnumber http URLs by far
}
try {
DigestURL crawlingStartURL = new DigestURL(crawlingStart);
rootURLs.add(crawlingStartURL);
crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(',');
if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
} catch (final MalformedURLException e) {
ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage());
}
}
crawlName = crawlNameBuilder.toString();
} else {
@ -676,7 +678,7 @@ public class Crawler_p {
// liftoff!
prop.put("info", "8");
prop.putHTML("info_crawlingURL", post.get("crawlingURL"));
// generate a YaCyNews if the global flag was set
if (!sb.isRobinsonMode() && crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(profile); // must be cloned

Loading…
Cancel
Save