From 4377bd2b70d19b1fd09672d780a9c21a07a6e038 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 30 Jun 2021 18:03:54 +0200 Subject: [PATCH] fix for wrong crawlName construction --- htroot/Crawler_p.java | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 5ee2bbd05..93d8728ab 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -267,21 +267,23 @@ public class Crawler_p { String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|")); Set rootURLs = new HashSet(); String crawlName = ""; - if (crawlingFile == null) for (String crawlingStart: rootURLs0) { + if (crawlingFile == null) { StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large - if (crawlingStart == null || crawlingStart.length() == 0) continue; - // add the prefix http:// if necessary - int pos = crawlingStart.indexOf("://",0); - if (pos == -1) { - if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "http://" + crawlingStart; - } - try { - DigestURL crawlingStartURL = new DigestURL(crawlingStart); - rootURLs.add(crawlingStartURL); - crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? 
crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(','); - if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false; - } catch (final MalformedURLException e) { - ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage()); + for (String crawlingStart: rootURLs0) { + if (crawlingStart == null || crawlingStart.length() == 0) continue; + // add the prefix http:// if necessary + int pos = crawlingStart.indexOf("://",0); + if (pos == -1) { + if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "https://" + crawlingStart; // we default to https instead of http because those outnumber http by far + } + try { + DigestURL crawlingStartURL = new DigestURL(crawlingStart); + rootURLs.add(crawlingStartURL); + crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(','); + if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false; + } catch (final MalformedURLException e) { + ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage()); + } } crawlName = crawlNameBuilder.toString(); } else { @@ -676,7 +678,7 @@ public class Crawler_p { // liftoff! prop.put("info", "8"); prop.putHTML("info_crawlingURL", post.get("crawlingURL")); - + // generate a YaCyNews if the global flag was set if (!sb.isRobinsonMode() && crawlOrder) { final Map m = new HashMap(profile); // must be cloned