diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java
index 07eaee23f..a8af897d2 100644
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@@ -32,6 +32,7 @@ import java.util.TreeMap;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentLinkedQueue;
 
+import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.index.BufferedObjectIndex;
 import net.yacy.kelondro.index.HandleSet;
 import net.yacy.kelondro.index.Row;
@@ -56,6 +57,7 @@ public class Balancer {
     private final ConcurrentHashMap domainStacks; // a map from host name to lists with url hashs
     private final ConcurrentLinkedQueue top;
     private final TreeMap delayed;
+    private final HandleSet ddc;
     private BufferedObjectIndex urlFileIndex;
     private final File cacheStacksPath;
     private long minimumLocalDelta;
@@ -77,6 +79,7 @@ public class Balancer {
         this.minimumLocalDelta = minimumLocalDelta;
         this.minimumGlobalDelta = minimumGlobalDelta;
         this.domStackInitSize = Integer.MAX_VALUE;
+        this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
         
         // create a stack for newly entered entries
         if (!(cachePath.exists())) cachePath.mkdir(); // make the path
@@ -208,8 +211,10 @@ public class Balancer {
         return removedCounter;
     }
     
-    public boolean has(final String urlhash) {
-        return urlFileIndex.has(urlhash.getBytes());
+    public boolean has(final byte[] urlhashb) {
+        synchronized (this) {
+            return this.urlFileIndex.has(urlhashb) || this.ddc.has(urlhashb);
+        }
     }
     
     public boolean notEmpty() {
@@ -240,15 +245,13 @@ public class Balancer {
         assert entry != null;
         final byte[] hash = entry.url().hash();
         synchronized (this) {
-            if (urlFileIndex.has(hash)) {
-                return;
-            }
+            if (this.urlFileIndex.has(hash) || this.ddc.has(hash)) return;
             
             // add to index
-            final int s = urlFileIndex.size();
-            urlFileIndex.put(entry.toRow());
-            assert s < urlFileIndex.size() : "hash = " + new String(hash) + ", s = " + s + ", size = " + urlFileIndex.size();
-            assert urlFileIndex.has(hash) : "hash = " + new String(hash);
+            final int s = this.urlFileIndex.size();
+            this.urlFileIndex.put(entry.toRow());
+            assert s < this.urlFileIndex.size() : "hash = " + new String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
+            assert this.urlFileIndex.has(hash) : "hash = " + new String(hash);
            
             // add the hash to a queue
             pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
@@ -409,6 +412,8 @@ public class Balancer {
                 }
                 break;
             }
+            if (crawlEntry != null)
+                try { this.ddc.put(crawlEntry.url().hash()); } catch (RowSpaceExceededException e) {}
         }
         
         if (crawlEntry == null) return null;
@@ -430,6 +435,7 @@ public class Balancer {
                 try {synchronized(this) { this.wait(3000); }} catch (final InterruptedException e) {}
             }
         }
+        this.ddc.remove(crawlEntry.url().hash());
         Latency.update(crawlEntry.url());
         return crawlEntry;
     }
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 209f391c5..6270beec5 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -131,19 +131,17 @@ public class CrawlQueues {
     }
     
     /**
-     * tests if hash occurrs in any database
+     * tests if hash occurs in any database
      * @param hash
      * @return if the hash exists, the name of the database is returned, otherwise null is returned
      */
    public String urlExists(final byte[] hash) {
         if (delegatedURL.exists(hash)) return "delegated";
         if (errorURL.exists(hash)) return "errors";
-        /*
         if (noticeURL.existsInStack(hash)) return "crawler";
         for (final crawlWorker worker: workers.values()) {
-            if (worker.request.url().hash().equals(hash)) return "worker";
+            if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) return "worker";
         }
-        */
         return null;
     }
     
@@ -539,7 +537,7 @@ public class CrawlQueues {
                 // checking robots.txt for http(s) resources
                 this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
                 if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && sb.robots.isDisallowed(request.url())) {
-                    if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
+                    //if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
                     errorURL.push(
                             this.request,
                             sb.peers.mySeed().hash.getBytes(),
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index 6feccccf8..656d20f4b 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -322,14 +322,14 @@ public final class CrawlStacker {
         } else {
             final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
             if (recrawl) {
-                if (this.log.isFine())
-                    this.log.logFine("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
+                if (this.log.isInfo())
+                    this.log.logInfo("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
                         ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
             } else {
                 if (dbocc == null) {
                     return "double in: LURL-DB";
                 } else {
-                    if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
+                    if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
                     if (dbocc.equals("errors")) {
                         ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
                         return "double in: errors (" + errorEntry.anycause() + ")";
diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java
index c2a6b285c..36434ebd3 100755
--- a/source/de/anomic/crawler/NoticedURL.java
+++ b/source/de/anomic/crawler/NoticedURL.java
@@ -142,12 +142,12 @@ public class NoticedURL {
         }
     }
     
-    public boolean existsInStack(final String urlhash) {
+    public boolean existsInStack(final byte[] urlhashb) {
         return
-            coreStack.has(urlhash) ||
-            limitStack.has(urlhash) ||
-            //overhangStack.has(urlhash) ||
-            remoteStack.has(urlhash);
+            coreStack.has(urlhashb) ||
+            limitStack.has(urlhashb) ||
+            //overhangStack.has(urlhashb) ||
+            remoteStack.has(urlhashb);
     }
     
     public void push(final int stackType, final Request entry) {
diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java
index e134e66d5..e95afb4da 100755
--- a/source/de/anomic/crawler/ZURL.java
+++ b/source/de/anomic/crawler/ZURL.java
@@ -132,6 +132,7 @@ public class ZURL implements Iterable {
         Entry entry = new Entry(bentry, executor, workdate, workcount, anycause);
         put(entry);
         stack.add(entry.hash());
+        Log.logInfo("URL Errors", bentry.url().toNormalform(false, false) + " - " + anycause);
         while (stack.size() > maxStackSize) stack.poll();
     }
     
diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java
index ab55b9109..e818fd570 100644
--- a/source/de/anomic/crawler/retrieval/FileLoader.java
+++ b/source/de/anomic/crawler/retrieval/FileLoader.java
@@ -67,7 +67,7 @@ public class FileLoader {
             String[] l = url.list();
             if (l == null) {
                 // this can only happen if there is no connection or the directory does not exist
-                log.logInfo("directory listing not available. URL = " + request.url().toString());
+                //log.logInfo("directory listing not available. URL = " + request.url().toString());
                 sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString());
                 throw new IOException("directory listing not available. URL = " + request.url().toString());
             }
diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java
index 51979b267..b54f14dae 100644
--- a/source/de/anomic/crawler/retrieval/SMBLoader.java
+++ b/source/de/anomic/crawler/retrieval/SMBLoader.java
@@ -81,7 +81,7 @@ public class SMBLoader {
             String[] l = url.list();
             if (l == null) {
                 // this can only happen if there is no connection or the directory does not exist
-                log.logInfo("directory listing not available. URL = " + request.url().toString());
+                //log.logInfo("directory listing not available. URL = " + request.url().toString());
                 sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString());
                 throw new IOException("directory listing not available. URL = " + request.url().toString());
             }
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 352f6ba02..c6b090a62 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1238,7 +1238,7 @@ public final class Switchboard extends serverSwitch {
         if (noIndexReason != null) {
             // log cause and close queue
             final DigestURI referrerURL = response.referrerURL();
-            if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
+            //if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
            addURLtoErrorDB(response.url(), (referrerURL == null) ? null : referrerURL.hash(), response.initiator(), response.name(), noIndexReason);
             // finish this entry
             return "not allowed: " + noIndexReason;
@@ -1714,7 +1714,7 @@ public final class Switchboard extends serverSwitch {
                 b = Cache.getContent(response.url());
                 if (b == null) {
                     this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
-                    addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing");
+                    addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing in cache");
                     return null;
                 }
             }
@@ -1790,6 +1790,7 @@ public final class Switchboard extends serverSwitch {
         for (Document document: in.documents) {
             if (document.indexingDenied()) {
                 if (log.isInfo()) log.logInfo("Not Condensed Resource '" + in.queueEntry.url().toNormalform(false, true) + "': denied by document-attached noindexing rule");
+                addURLtoErrorDB(in.queueEntry.url(), in.queueEntry.referrerHash(), in.queueEntry.initiator(), in.queueEntry.name(), "denied by document-attached noindexing rule");
                 continue;
             }
             doclist.add(document);
@@ -1850,14 +1851,14 @@ public final class Switchboard extends serverSwitch {
         if (process == Segments.Process.SURROGATES) processCase = EventOrigin.SURROGATES;
         
         if (condenser == null || document.indexingDenied()) {
-            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
-            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by rule in document");
+            //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
+            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by rule in document, process case=" + processCase);
             return;
         }
         
         if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
-            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
-            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
+            //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
+            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
             return;
         }
         
@@ -1878,8 +1879,8 @@ public final class Switchboard extends serverSwitch {
                     searchEvent);
             yacyChannel.channels(Base64Order.enhancedCoder.equal(queueEntry.initiator(), peers.mySeed().hash.getBytes()) ? yacyChannel.LOCALINDEXING : yacyChannel.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
         } catch (final IOException e) {
-            if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
-            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + e.getMessage());
+            //if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
+            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage());
             return;
         }
         
@@ -2360,19 +2361,17 @@ public final class Switchboard extends serverSwitch {
            
            try {
                url = new DigestURI(seedListFileURL, null);
-                final long start = System.currentTimeMillis();
-//              header = Client.whead(url.toString(), reqHeader);
+                //final long start = System.currentTimeMillis();
                client.HEADResponse(url.toString());
                header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
-                final long loadtime = System.currentTimeMillis() - start;
-//                if (header == null) {
-                if (header == null) {
+                //final long loadtime = System.currentTimeMillis() - start;
+                /*if (header == null) {
                    if (loadtime > getConfigLong("bootstrapLoadTimeout", 6000)) {
                        yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, time-out after " + loadtime + " milliseconds");
                    } else {
                        yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, no content");
                    }
-                } else if (header.lastModified() == null) {
+                } else*/ if (header.lastModified() == null) {
                    yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not usable, last-modified is missing");
                } else if ((header.age() > 86400000) && (ssc > 0)) {
                    yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " too old (" + (header.age() / 86400000) + " days)");
diff --git a/source/net/yacy/kelondro/table/Table.java b/source/net/yacy/kelondro/table/Table.java
index efd1e791e..2932491d2 100644
--- a/source/net/yacy/kelondro/table/Table.java
+++ b/source/net/yacy/kelondro/table/Table.java
@@ -190,7 +190,8 @@ public class Table implements Index, Iterable {
         
         // open the file
         this.file = new BufferedRecords(new Records(tablefile, rowdef.objectsize), this.buffersize);
-        
+        assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
+        
         // clean up the file by cleaning badly formed entries
         int errorc = errors.size();
         int errorcc = 0;
@@ -202,7 +203,7 @@ public class Table implements Index, Iterable {
             removeInFile(idx);
         }
         errors.close();
-        assert this.index.size() == this.file.size() : "index.size() = " + index.size() + ", this.file.size() = " + this.file.size();
+        assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
         
         // remove doubles
         if (!freshFile) {
@@ -449,8 +450,8 @@ public class Table implements Index, Iterable {
     public Entry get(final byte[] key) throws IOException {
         if ((file == null) || (index == null)) return null;
         synchronized (this) {
-            assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
-            assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
+            assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
+            assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
         }
         Entry e = get0(key);
         if (e != null && this.rowdef.objectOrder.equal(key, e.getPrimaryKeyBytes())) return e;
@@ -543,8 +544,8 @@ public class Table implements Index, Iterable {
     }
     
     public synchronized void put(final Entry row) throws IOException, RowSpaceExceededException {
-        assert file == null || file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
-        assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
+        assert file == null || file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
+        assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size() + ", file = " + this.filename();
         assert row != null;
         assert row.bytes() != null;
         if (file == null || row == null || row.bytes() == null) return;
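
Note on the Balancer change above: the new ddc HandleSet records the hashes of entries that have already been handed to a crawl worker but have not yet left the balancer, so has() and push() also see in-flight URLs and cannot re-admit them while they are being fetched. The following self-contained sketch illustrates that pattern with plain JDK collections; SimpleBalancer, its field names and the done() method are illustrative stand-ins rather than YaCy classes, and in the actual patch the hash is added when an entry is chosen inside the pop loop and removed again at the end of the same call, after the crawl-delay wait.

// Minimal standalone sketch (not YaCy code) of the double-check idea in the Balancer patch:
// besides the persistent queue, a second set remembers hashes that were popped and handed
// to a worker but are not finished yet, so has()/push() still treat them as known.
import java.util.Comparator;
import java.util.Set;
import java.util.TreeSet;

public class SimpleBalancer {

    // byte[] has no value-based equals/hashCode, so both sets need an explicit order
    private static final Comparator<byte[]> ORDER = new Comparator<byte[]>() {
        public int compare(final byte[] a, final byte[] b) {
            final int n = Math.min(a.length, b.length);
            for (int i = 0; i < n; i++) {
                final int d = (a[i] & 0xff) - (b[i] & 0xff);
                if (d != 0) return d;
            }
            return a.length - b.length;
        }
    };

    private final Set<byte[]> queued   = new TreeSet<byte[]>(ORDER); // stands in for urlFileIndex
    private final Set<byte[]> inFlight = new TreeSet<byte[]>(ORDER); // stands in for the ddc set

    public synchronized boolean has(final byte[] urlhash) {
        // a URL counts as known while it is queued OR currently handed out to a worker
        return queued.contains(urlhash) || inFlight.contains(urlhash);
    }

    public synchronized boolean push(final byte[] urlhash) {
        if (has(urlhash)) return false; // double-check before enqueueing
        return queued.add(urlhash);
    }

    public synchronized byte[] pop() {
        if (queued.isEmpty()) return null;
        final byte[] hash = queued.iterator().next();
        queued.remove(hash);
        inFlight.add(hash);             // like ddc.put(...) when an entry is chosen
        return hash;
    }

    public synchronized void done(final byte[] urlhash) {
        inFlight.remove(urlhash);       // like ddc.remove(...) once the entry has left the balancer
    }
}

The rest of the patch depends on this visibility: CrawlQueues.urlExists now also consults the crawler stacks and the running workers, so a URL that is momentarily between the queue and a worker is still reported as a double instead of being stacked a second time.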