diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index ce48c96e6..7a36de594 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -394,7 +394,7 @@ public class Balancer {
 
             // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
             // if not: return null. A calling method must handle the null value and try again
-            profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+            profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
             if (profileEntry == null) {
                 ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                 continue;
@@ -481,7 +481,7 @@ public class Balancer {
                 rowEntry = this.urlFileIndex.get(urlhash, false);
                 if (rowEntry == null) continue; // may have been deleted there in the meantime
                 Request crawlEntry = new Request(rowEntry);
-                CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+                CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
                 if (profileEntry == null) {
                     ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                     continue;
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 5ff4cbe5e..9d79efefd 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -149,7 +149,7 @@ public final class CrawlStacker {
 
             // if the url was rejected we store it into the error URL db
             if (rejectReason != null && !rejectReason.startsWith("double in")) {
-                final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+                final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
                 this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
             }
         } catch (final Exception e) {
@@ -294,7 +294,8 @@ public final class CrawlStacker {
 
     public String stackCrawl(final Request entry) {
         //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-        final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+        byte[] handle = UTF8.getBytes(entry.profileHandle());
+        final CrawlProfile profile = this.crawler.get(handle);
         String error;
         if (profile == null) {
             error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index 15ad6c1d7..0e58f1674 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -166,6 +166,23 @@ public final class CrawlSwitchboard {
             / 1024);
     }
 
+    /**
+     * Get a profile from the active or the passive stack. Use this to be sure not to miss old
+     * profiles already cleaned from the active stack; a passive hit is moved back to the active stack.
+     * @param profileKey the profile handle as byte array
+     * @return the profile, or null if the handle is known in neither stack
+     */
+    public CrawlProfile get(final byte[] profileKey) {
+        CrawlProfile profile = getActive(profileKey);
+        if (profile != null) return profile;
+        profile = getPassive(profileKey);
+        if (profile == null) return null;
+        // the profile was parked on the passive stack: promote it back to the active stack
+        this.putActive(profileKey, profile);
+        this.removePassive(profileKey);
+        return profile;
+    }
+
     public CrawlProfile getActive(final byte[] profileKey) {
         if ( profileKey == null ) {
             return null;
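The behavioral change carried by every call-site hunk in this patch is contained in that one method: a handle that misses the active stack no longer yields null when the profile has merely aged onto the passive stack. A minimal sketch of the before/after semantics, assuming a CrawlSwitchboard instance cs and a Request whose profile has been cleaned to the passive stack (the names cs and request are illustrative, not from the patch):

    // Sketch only: illustrates the lookup fallback introduced above.
    byte[] handle = UTF8.getBytes(request.profileHandle());

    // Before: getActive() returns null for a profile that was cleaned to the
    // passive stack, so callers logged "no profile entry for handle ..." and
    // dropped the entry.
    CrawlProfile p = cs.getActive(handle);   // null for cleaned profiles

    // After: get() falls back to the passive stack and, on a hit, promotes
    // the profile back to the active stack via putActive()/removePassive().
    CrawlProfile q = cs.get(handle);         // non-null for cleaned profiles
    if (q == null) {
        // the handle is unknown in both stacks: the entry is truly orphaned
    }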
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index b393c08fe..819a84065 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -148,7 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         }
         if (name.length() > 256) name = name.substring(0, 256);
         this.doms = new ConcurrentHashMap<String, DomProfile>();
-        final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages)).substring(0, Word.commonHashLength);
+        final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
         put(HANDLE, handle);
         put(NAME, name);
         put(AGENT_NAME, userAgentName);
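The handle is what both profile stacks are keyed by, so this digest change is what makes the lookup above reliable: before it, two profiles that differed only in their collections hashed to the same handle and overwrote each other on the stacks. A worked sketch of the collision, assuming the constructor parameters of the hunk above are in scope and using made-up collection values "user" and "intranet":

    // Sketch only: the fields below mirror the digest input in the hunk above.
    String base = name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages;

    // Old handle: collections did not participate in the digest, so profiles
    // differing only in collections collided on the same handle.
    String oldHandle = Base64Order.enhancedCoder.encode(
            Digest.encodeMD5Raw(base)).substring(0, Word.commonHashLength);

    // New handle: the collections string is digested too, so "user" and
    // "intranet" crawls with otherwise identical settings stay distinct.
    String handleUser = Base64Order.enhancedCoder.encode(
            Digest.encodeMD5Raw(base + "user")).substring(0, Word.commonHashLength);
    String handleIntranet = Base64Order.enhancedCoder.encode(
            Digest.encodeMD5Raw(base + "intranet")).substring(0, Word.commonHashLength);
    assert !handleUser.equals(handleIntranet);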
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index be18de809..08bc10a0d 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -255,7 +255,7 @@ public class CrawlQueues {
             this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
             return true;
         }
-        final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle));
+        final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(profileHandle));
         if (profile == null) {
             this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
             return true;
@@ -297,7 +297,7 @@ public class CrawlQueues {
      * @return
      */
     private void load(final Request urlEntry, final String stats, final String profileHandle) {
-        final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
+        final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(profileHandle));
         if (profile != null) {
 
             // check if the protocol is supported
@@ -606,7 +606,7 @@ public class CrawlQueues {
             this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
             this.code = Integer.valueOf(entry.hashCode());
             this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause other functions to work worse
-            this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
+            this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
         }
 
         private long age() {
diff --git a/source/net/yacy/crawler/retrieval/FTPLoader.java b/source/net/yacy/crawler/retrieval/FTPLoader.java
index 81bc12e68..aaf3b6c2a 100644
--- a/source/net/yacy/crawler/retrieval/FTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/FTPLoader.java
@@ -101,7 +101,7 @@ public class FTPLoader {
 
         // create new ftp client
         final FTPClient ftpClient = new FTPClient();
-        final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+        final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
         // get a connection
         if (openConnection(ftpClient, entryUrl)) {
             // test if the specified file is a directory
@@ -249,7 +249,7 @@ public class FTPLoader {
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+            final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
             final Response response = new Response(
                     request,
                     requestHeader,
@@ -264,7 +264,7 @@ public class FTPLoader {
             final byte[] b = ftpClient.get(path);
 
             // create a response
-            final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+            final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
             final Response response = new Response(
                     request,
                     requestHeader,
diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java
index 06d8bde3c..676ab9d4f 100644
--- a/source/net/yacy/crawler/retrieval/FileLoader.java
+++ b/source/net/yacy/crawler/retrieval/FileLoader.java
@@ -83,7 +83,7 @@ public class FileLoader {
             ResponseHeader responseHeader = new ResponseHeader(200);
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+            final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
             Response response = new Response(
                     request,
                     requestHeader,
@@ -123,7 +123,7 @@ public class FileLoader {
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+            final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
             Response response = new Response(
                     request,
                     requestHeader,
@@ -140,7 +140,7 @@ public class FileLoader {
         is.close();
 
         // create response with loaded content
-        final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+        final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
         Response response = new Response(
                 request,
                 requestHeader,
diff --git a/source/net/yacy/crawler/retrieval/SMBLoader.java b/source/net/yacy/crawler/retrieval/SMBLoader.java
index 56c2adca1..302bf6bca 100644
--- a/source/net/yacy/crawler/retrieval/SMBLoader.java
+++ b/source/net/yacy/crawler/retrieval/SMBLoader.java
@@ -101,7 +101,7 @@ public class SMBLoader {
             ResponseHeader responseHeader = new ResponseHeader(200);
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+            final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
             Response response = new Response(
                     request,
                     requestHeader,
@@ -141,7 +141,7 @@ public class SMBLoader {
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
@@ -158,7 +158,7 @@ public class SMBLoader {
        is.close();

        // create response with loaded content
-        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
         Response response = new Response(
                 request,
                 requestHeader,
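Note that the patch reaches get() through three different byte conversions: UTF8.getBytes(...) and ASCII.getBytes(...) in the other files, and the default-charset String.getBytes() in the two SMBLoader hunks above. Because a handle is a Base64 string (see the CrawlProfile constructor earlier in this patch), all three produce identical bytes, but the bare getBytes() silently depends on the JVM's file.encoding. A hypothetical helper, not part of the patch, that would pin this down:

    // Sketch only: centralizes handle-to-bytes conversion with an explicit charset.
    private static byte[] handleBytes(final String profileHandle) {
        return profileHandle.getBytes(java.nio.charset.StandardCharsets.US_ASCII);
    }

    // e.g. in SMBLoader:
    // final CrawlProfile profile = this.sb.crawler.get(handleBytes(request.profileHandle()));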
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 7f7d59836..d421d5258 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -187,7 +187,7 @@ public final class LoaderDispatcher {
         if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
         final String protocol = url.getProtocol();
         final String host = url.getHost();
-        final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
+        final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
 
         // check if url is in blacklist
         if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 6a0c9f106..f87573b78 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2514,7 +2514,7 @@ public final class Switchboard extends serverSwitch {
             } else {
                 // we consider this a fail url to keep track of the problem
                 if (rejectReason != null && !rejectReason.startsWith("double in")) {
-                    final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle()));
+                    final CrawlProfile profile = this.crawler.get(UTF8.getBytes(response.profile().handle()));
                     this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
                 }
             }
@@ -3002,7 +3002,7 @@ public final class Switchboard extends serverSwitch {
                     continue;
                 }
                 final Request request = this.loader.request(e.getValue(), true, true);
-                final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+                final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
                 final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
                 if (acceptedError != null) {
                     this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
@@ -3032,7 +3032,7 @@ public final class Switchboard extends serverSwitch {
                         final Document[] documents = response.parse();
                         if (documents != null) {
                             for (final Document document: documents) {
-                                final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle()));
+                                final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle()));
                                 if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
                                     throw new Parser.Failure("indexing is denied", url);
                                 }
@@ -3075,7 +3075,7 @@ public final class Switchboard extends serverSwitch {
                 if (existingids.contains(e.getKey())) continue; // double
                 DigestURL url = e.getValue();
                 final Request request = this.loader.request(url, true, true);
-                final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+                final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
                 final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
                 if (acceptedError != null) {
                     this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);