self-healing of mistakenly deactivated crawl profiles. This fixes a bug

which can happen in rare cases when a crawl start and a cleanup process
happen at the same time.
pull/1/head
Michael Peter Christen 12 years ago
parent 095053a9b4
commit 91a875dff5

@ -394,7 +394,7 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again // if not: return null. A calling method must handle the null value and try again
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle())); profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) { if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle()); ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue; continue;
@ -481,7 +481,7 @@ public class Balancer {
rowEntry = this.urlFileIndex.get(urlhash, false); rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) continue; // may have been deleted there manwhile if (rowEntry == null) continue; // may have been deleted there manwhile
Request crawlEntry = new Request(rowEntry); Request crawlEntry = new Request(rowEntry);
CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle())); CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) { if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle()); ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue; continue;

@ -149,7 +149,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db // if the url was rejected we store it into the error URL db
if (rejectReason != null && !rejectReason.startsWith("double in")) { if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle())); final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
} }
} catch (final Exception e) { } catch (final Exception e) {
@ -294,7 +294,8 @@ public final class CrawlStacker {
public String stackCrawl(final Request entry) { public String stackCrawl(final Request entry) {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'"); //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle())); byte[] handle = UTF8.getBytes(entry.profileHandle());
final CrawlProfile profile = this.crawler.get(handle);
String error; String error;
if (profile == null) { if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();

@ -166,6 +166,23 @@ public final class CrawlSwitchboard {
/ 1024); / 1024);
} }
/**
 * Look up a crawl profile by its handle, searching the active stack first and
 * falling back to the passive stack. Using this instead of {@code getActive}
 * ensures that old profiles which were moved to the passive stack (e.g. by a
 * concurrent cleanup process) are not lost.
 * A profile found only on the passive stack is self-healed: it is re-inserted
 * into the active stack and removed from the passive stack before returning.
 * @param profileKey the profile handle as raw bytes
 * @return the matching profile, or null if it exists on neither stack
 */
public CrawlProfile get(final byte[] profileKey) {
    final CrawlProfile active = getActive(profileKey);
    if (active != null) {
        return active;
    }
    final CrawlProfile revived = getPassive(profileKey);
    if (revived != null) {
        // self-healing: shift the mistakenly deactivated profile back to the active stack
        this.putActive(profileKey, revived);
        this.removePassive(profileKey);
    }
    return revived;
}
public CrawlProfile getActive(final byte[] profileKey) { public CrawlProfile getActive(final byte[] profileKey) {
if ( profileKey == null ) { if ( profileKey == null ) {
return null; return null;

@ -148,7 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
} }
if (name.length() > 256) name = name.substring(256); if (name.length() > 256) name = name.substring(256);
this.doms = new ConcurrentHashMap<String, AtomicInteger>(); this.doms = new ConcurrentHashMap<String, AtomicInteger>();
final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages)).substring(0, Word.commonHashLength); final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
put(HANDLE, handle); put(HANDLE, handle);
put(NAME, name); put(NAME, name);
put(AGENT_NAME, userAgentName); put(AGENT_NAME, userAgentName);

@ -255,7 +255,7 @@ public class CrawlQueues {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true; return true;
} }
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle)); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(profileHandle));
if (profile == null) { if (profile == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true; return true;
@ -297,7 +297,7 @@ public class CrawlQueues {
* @return * @return
*/ */
private void load(final Request urlEntry, final String stats, final String profileHandle) { private void load(final Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle)); final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(profileHandle));
if (profile != null) { if (profile != null) {
// check if the protocol is supported // check if the protocol is supported
@ -606,7 +606,7 @@ public class CrawlQueues {
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED); this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode()); this.code = Integer.valueOf(entry.hashCode());
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle())); this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
} }
private long age() { private long age() {

@ -101,7 +101,7 @@ public class FTPLoader {
// create new ftp client // create new ftp client
final FTPClient ftpClient = new FTPClient(); final FTPClient ftpClient = new FTPClient();
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
// get a connection // get a connection
if (openConnection(ftpClient, entryUrl)) { if (openConnection(ftpClient, entryUrl)) {
// test if the specified file is a directory // test if the specified file is a directory
@ -249,7 +249,7 @@ public class FTPLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response( final Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -264,7 +264,7 @@ public class FTPLoader {
final byte[] b = ftpClient.get(path); final byte[] b = ftpClient.get(path);
// create a response // create a response
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response( final Response response = new Response(
request, request,
requestHeader, requestHeader,

@ -83,7 +83,7 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader(200); ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -123,7 +123,7 @@ public class FileLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -140,7 +140,7 @@ public class FileLoader {
is.close(); is.close();
// create response with loaded content // create response with loaded content
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,

@ -101,7 +101,7 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader(200); ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -141,7 +141,7 @@ public class SMBLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -158,7 +158,7 @@ public class SMBLoader {
is.close(); is.close();
// create response with loaded content // create response with loaded content
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,

@ -187,7 +187,7 @@ public final class LoaderDispatcher {
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
final String protocol = url.getProtocol(); final String protocol = url.getProtocol();
final String host = url.getHost(); final String host = url.getHost();
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle())); final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist // check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) { if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {

@ -2514,7 +2514,7 @@ public final class Switchboard extends serverSwitch {
} else { } else {
// we consider this as fail urls to have a tracking of the problem // we consider this as fail urls to have a tracking of the problem
if (rejectReason != null && !rejectReason.startsWith("double in")) { if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle())); final CrawlProfile profile = this.crawler.get(UTF8.getBytes(response.profile().handle()));
this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
} }
} }
@ -3002,7 +3002,7 @@ public final class Switchboard extends serverSwitch {
continue; continue;
} }
final Request request = this.loader.request(e.getValue(), true, true); final Request request = this.loader.request(e.getValue(), true, true);
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0); final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
if (acceptedError != null) { if (acceptedError != null) {
this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError); this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
@ -3032,7 +3032,7 @@ public final class Switchboard extends serverSwitch {
final Document[] documents = response.parse(); final Document[] documents = response.parse();
if (documents != null) { if (documents != null) {
for (final Document document: documents) { for (final Document document: documents) {
final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle()));
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) { if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url); throw new Parser.Failure("indexing is denied", url);
} }
@ -3075,7 +3075,7 @@ public final class Switchboard extends serverSwitch {
if (existingids.contains(e.getKey())) continue; // double if (existingids.contains(e.getKey())) continue; // double
DigestURL url = e.getValue(); DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true); final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0); final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
if (acceptedError != null) { if (acceptedError != null) {
this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError); this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);

Loading…
Cancel
Save