|
|
@ -607,8 +607,7 @@ public class Balancer {
|
|
|
|
if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
|
|
|
|
if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
|
|
|
|
this.domainStacks.clear();
|
|
|
|
this.domainStacks.clear();
|
|
|
|
this.lastDomainStackFill = System.currentTimeMillis();
|
|
|
|
this.lastDomainStackFill = System.currentTimeMillis();
|
|
|
|
//final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2);
|
|
|
|
final HandleSet blackhandles = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
|
|
|
|
//final CloneableIterator<byte[]> i = handles.keys(true, null);
|
|
|
|
|
|
|
|
String host;
|
|
|
|
String host;
|
|
|
|
Request request;
|
|
|
|
Request request;
|
|
|
|
int count = 0;
|
|
|
|
int count = 0;
|
|
|
@ -616,6 +615,14 @@ public class Balancer {
|
|
|
|
for (Row.Entry entry: this.urlFileIndex.random(10000)) {
|
|
|
|
for (Row.Entry entry: this.urlFileIndex.random(10000)) {
|
|
|
|
if (entry == null) continue;
|
|
|
|
if (entry == null) continue;
|
|
|
|
request = new Request(entry);
|
|
|
|
request = new Request(entry);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// check blacklist (again) because the user may have created blacklist entries after the queue has been filled
|
|
|
|
|
|
|
|
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, request.url())) {
|
|
|
|
|
|
|
|
Log.logFine("CRAWLER", "URL '" + request.url() + "' is in blacklist.");
|
|
|
|
|
|
|
|
try {blackhandles.put(entry.getPrimaryKeyBytes());} catch (SpaceExceededException e) {}
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
host = request.url().getHost();
|
|
|
|
host = request.url().getHost();
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
pushHashToDomainStacks(host, request.url().hosthash(), entry.getPrimaryKeyBytes());
|
|
|
|
pushHashToDomainStacks(host, request.url().hosthash(), entry.getPrimaryKeyBytes());
|
|
|
@ -625,7 +632,11 @@ public class Balancer {
|
|
|
|
count++;
|
|
|
|
count++;
|
|
|
|
if (this.domainStacks.size() >= 1000 || count >= 100000 || System.currentTimeMillis() > timeout) break;
|
|
|
|
if (this.domainStacks.size() >= 1000 || count >= 100000 || System.currentTimeMillis() > timeout) break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
|
|
|
|
|
|
|
|
|
|
|
|
// if we collected blacklist entries then delete them now
|
|
|
|
|
|
|
|
for (byte[] blackhandle: blackhandles) this.urlFileIndex.remove(blackhandle);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", blackhandles = " + blackhandles.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
|
|
|
|
this.domStackInitSize = this.domainStacks.size();
|
|
|
|
this.domStackInitSize = this.domainStacks.size();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|