- better url double check in crawler

- more logging for error urls

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7032 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent a6ed6e8cb9
commit a82a93f2fc

@ -32,6 +32,7 @@ import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.Row;
@ -56,6 +57,7 @@ public class Balancer {
private final ConcurrentHashMap<String, HandleSet> domainStacks; // a map from host name to lists with url hashs
private final ConcurrentLinkedQueue<byte[]> top;
private final TreeMap<Long, byte[]> delayed;
private final HandleSet ddc;
private BufferedObjectIndex urlFileIndex;
private final File cacheStacksPath;
private long minimumLocalDelta;
@ -77,6 +79,7 @@ public class Balancer {
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
this.domStackInitSize = Integer.MAX_VALUE;
this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
@ -208,8 +211,10 @@ public class Balancer {
return removedCounter;
}
public boolean has(final String urlhash) {
return urlFileIndex.has(urlhash.getBytes());
public boolean has(final byte[] urlhashb) {
synchronized (this) {
return this.urlFileIndex.has(urlhashb) || this.ddc.has(urlhashb);
}
}
public boolean notEmpty() {
@ -240,15 +245,13 @@ public class Balancer {
assert entry != null;
final byte[] hash = entry.url().hash();
synchronized (this) {
if (urlFileIndex.has(hash)) {
return;
}
if (this.urlFileIndex.has(hash) || this.ddc.has(hash)) return;
// add to index
final int s = urlFileIndex.size();
urlFileIndex.put(entry.toRow());
assert s < urlFileIndex.size() : "hash = " + new String(hash) + ", s = " + s + ", size = " + urlFileIndex.size();
assert urlFileIndex.has(hash) : "hash = " + new String(hash);
final int s = this.urlFileIndex.size();
this.urlFileIndex.put(entry.toRow());
assert s < this.urlFileIndex.size() : "hash = " + new String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
assert this.urlFileIndex.has(hash) : "hash = " + new String(hash);
// add the hash to a queue
pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
@ -409,6 +412,8 @@ public class Balancer {
}
break;
}
if (crawlEntry != null)
try { this.ddc.put(crawlEntry.url().hash()); } catch (RowSpaceExceededException e) {}
}
if (crawlEntry == null) return null;
@ -430,6 +435,7 @@ public class Balancer {
try {synchronized(this) { this.wait(3000); }} catch (final InterruptedException e) {}
}
}
this.ddc.remove(crawlEntry.url().hash());
Latency.update(crawlEntry.url());
return crawlEntry;
}

@ -131,19 +131,17 @@ public class CrawlQueues {
}
/**
* tests if hash occurrs in any database
* tests if hash occurs in any database
* @param hash
* @return if the hash exists, the name of the database is returned, otherwise null is returned
*/
public String urlExists(final byte[] hash) {
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors";
/*
if (noticeURL.existsInStack(hash)) return "crawler";
for (final crawlWorker worker: workers.values()) {
if (worker.request.url().hash().equals(hash)) return "worker";
if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) return "worker";
}
*/
return null;
}
@ -539,7 +537,7 @@ public class CrawlQueues {
// checking robots.txt for http(s) resources
this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && sb.robots.isDisallowed(request.url())) {
if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
//if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
errorURL.push(
this.request,
sb.peers.mySeed().hash.getBytes(),

@ -322,14 +322,14 @@ public final class CrawlStacker {
} else {
final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
if (recrawl) {
if (this.log.isFine())
this.log.logFine("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
if (this.log.isInfo())
this.log.logInfo("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
} else {
if (dbocc == null) {
return "double in: LURL-DB";
} else {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
if (dbocc.equals("errors")) {
ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";

@ -142,12 +142,12 @@ public class NoticedURL {
}
}
public boolean existsInStack(final String urlhash) {
public boolean existsInStack(final byte[] urlhashb) {
return
coreStack.has(urlhash) ||
limitStack.has(urlhash) ||
//overhangStack.has(urlhash) ||
remoteStack.has(urlhash);
coreStack.has(urlhashb) ||
limitStack.has(urlhashb) ||
//overhangStack.has(urlhashb) ||
remoteStack.has(urlhashb);
}
public void push(final int stackType, final Request entry) {

@ -132,6 +132,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
Entry entry = new Entry(bentry, executor, workdate, workcount, anycause);
put(entry);
stack.add(entry.hash());
Log.logInfo("URL Errors", bentry.url().toNormalform(false, false) + " - " + anycause);
while (stack.size() > maxStackSize) stack.poll();
}

@ -67,7 +67,7 @@ public class FileLoader {
String[] l = url.list();
if (l == null) {
// this can only happen if there is no connection or the directory does not exist
log.logInfo("directory listing not available. URL = " + request.url().toString());
//log.logInfo("directory listing not available. URL = " + request.url().toString());
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString());
throw new IOException("directory listing not available. URL = " + request.url().toString());
}

@ -81,7 +81,7 @@ public class SMBLoader {
String[] l = url.list();
if (l == null) {
// this can only happen if there is no connection or the directory does not exist
log.logInfo("directory listing not available. URL = " + request.url().toString());
//log.logInfo("directory listing not available. URL = " + request.url().toString());
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString());
throw new IOException("directory listing not available. URL = " + request.url().toString());
}

@ -1238,7 +1238,7 @@ public final class Switchboard extends serverSwitch {
if (noIndexReason != null) {
// log cause and close queue
final DigestURI referrerURL = response.referrerURL();
if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
//if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(response.url(), (referrerURL == null) ? null : referrerURL.hash(), response.initiator(), response.name(), noIndexReason);
// finish this entry
return "not allowed: " + noIndexReason;
@ -1714,7 +1714,7 @@ public final class Switchboard extends serverSwitch {
b = Cache.getContent(response.url());
if (b == null) {
this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing");
addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing in cache");
return null;
}
}
@ -1790,6 +1790,7 @@ public final class Switchboard extends serverSwitch {
for (Document document: in.documents) {
if (document.indexingDenied()) {
if (log.isInfo()) log.logInfo("Not Condensed Resource '" + in.queueEntry.url().toNormalform(false, true) + "': denied by document-attached noindexing rule");
addURLtoErrorDB(in.queueEntry.url(), in.queueEntry.referrerHash(), in.queueEntry.initiator(), in.queueEntry.name(), "denied by document-attached noindexing rule");
continue;
}
doclist.add(document);
@ -1850,14 +1851,14 @@ public final class Switchboard extends serverSwitch {
if (process == Segments.Process.SURROGATES) processCase = EventOrigin.SURROGATES;
if (condenser == null || document.indexingDenied()) {
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by rule in document");
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by rule in document, process case=" + processCase);
return;
}
if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
return;
}
@ -1878,8 +1879,8 @@ public final class Switchboard extends serverSwitch {
searchEvent);
yacyChannel.channels(Base64Order.enhancedCoder.equal(queueEntry.initiator(), peers.mySeed().hash.getBytes()) ? yacyChannel.LOCALINDEXING : yacyChannel.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
} catch (final IOException e) {
if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + e.getMessage());
//if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage());
return;
}
@ -2360,19 +2361,17 @@ public final class Switchboard extends serverSwitch {
try {
url = new DigestURI(seedListFileURL, null);
final long start = System.currentTimeMillis();
// header = Client.whead(url.toString(), reqHeader);
//final long start = System.currentTimeMillis();
client.HEADResponse(url.toString());
header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final long loadtime = System.currentTimeMillis() - start;
// if (header == null) {
if (header == null) {
//final long loadtime = System.currentTimeMillis() - start;
/*if (header == null) {
if (loadtime > getConfigLong("bootstrapLoadTimeout", 6000)) {
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, time-out after " + loadtime + " milliseconds");
} else {
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, no content");
}
} else if (header.lastModified() == null) {
} else*/ if (header.lastModified() == null) {
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not usable, last-modified is missing");
} else if ((header.age() > 86400000) && (ssc > 0)) {
yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " too old (" + (header.age() / 86400000) + " days)");

@ -190,7 +190,8 @@ public class Table implements Index, Iterable<Row.Entry> {
// open the file
this.file = new BufferedRecords(new Records(tablefile, rowdef.objectsize), this.buffersize);
assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
// clean up the file by cleaning badly formed entries
int errorc = errors.size();
int errorcc = 0;
@ -202,7 +203,7 @@ public class Table implements Index, Iterable<Row.Entry> {
removeInFile(idx);
}
errors.close();
assert this.index.size() == this.file.size() : "index.size() = " + index.size() + ", this.file.size() = " + this.file.size();
assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
// remove doubles
if (!freshFile) {
@ -449,8 +450,8 @@ public class Table implements Index, Iterable<Row.Entry> {
public Entry get(final byte[] key) throws IOException {
if ((file == null) || (index == null)) return null;
synchronized (this) {
assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
}
Entry e = get0(key);
if (e != null && this.rowdef.objectOrder.equal(key, e.getPrimaryKeyBytes())) return e;
@ -543,8 +544,8 @@ public class Table implements Index, Iterable<Row.Entry> {
}
public synchronized void put(final Entry row) throws IOException, RowSpaceExceededException {
assert file == null || file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
assert file == null || file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
assert row != null;
assert row.bytes() != null;
if (file == null || row == null || row.bytes() == null) return;

Loading…
Cancel
Save