From 8600ea01dd2e165f391f06da9c4f9aa89114f098 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 28 Dec 2014 14:27:42 +0100 Subject: [PATCH] automatically switch on query option in case intranet protocols (smb/ftp) are used. This supports the new split-pdf option. --- htroot/Crawler_p.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 7e7bcceb9..7cf20d281 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -36,6 +36,7 @@ import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; @@ -314,9 +315,6 @@ public class Crawler_p { final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? 
post.getInt("crawlingDomMaxPages", -1) : -1; env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); - boolean crawlingQ = "on".equals(post.get("crawlingQ", "off")); // on unchecked checkbox "crawlingQ" not contained in post - env.setConfig("crawlingQ", crawlingQ); - boolean followFrames = "on".equals(post.get("followFrames", "false")); env.setConfig("followFrames", followFrames); @@ -354,7 +352,6 @@ public class Crawler_p { newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; newcrawlingdepth = 0; directDocByURL = false; - crawlingQ = true; } if ("sitelist".equals(crawlingMode)) { @@ -381,13 +378,18 @@ public class Crawler_p { // delete all error urls for that domain // and all urls for that host from the crawl queue Set hosthashes = new HashSet(); + boolean anysmbftporpdf = false; for (DigestURL u : rootURLs) { sb.index.fulltext().remove(u.hash()); hosthashes.add(u.hosthash()); + if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true; } sb.crawlQueues.removeHosts(hosthashes); sb.index.fulltext().commit(true); + boolean crawlingQ = anysmbftporpdf || "on".equals(post.get("crawlingQ", "off")) || "sitemap".equals(crawlingMode); + env.setConfig("crawlingQ", crawlingQ); + // compute mustmatch filter according to rootURLs if ((fullDomain || subPath) && newcrawlingdepth > 0) { String siteFilter = ".*";