diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index a80e71513..68a1a2e82 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -138,10 +138,12 @@ public class CrawlQueues { public String urlExists(final String hash) { if (delegatedURL.exists(hash)) return "delegated"; if (errorURL.exists(hash)) return "errors"; + /* + if (noticeURL.existsInStack(hash)) return "crawler"; for (final crawlWorker worker: workers.values()) { if (worker.request.url().hash().equals(hash)) return "worker"; } - if (noticeURL.existsInStack(hash)) return "crawler"; + */ return null; } @@ -154,9 +156,9 @@ public class CrawlQueues { public DigestURI getURL(final String urlhash) { assert urlhash != null; if (urlhash == null || urlhash.length() == 0) return null; - ZURL.Entry ee = delegatedURL.getEntry(urlhash); + ZURL.Entry ee = delegatedURL.get(urlhash); if (ee != null) return ee.url(); - ee = errorURL.getEntry(urlhash); + ee = errorURL.get(urlhash); if (ee != null) return ee.url(); for (final crawlWorker w: workers.values()) { if (w.request.url().hash().equals(urlhash)) return w.request.url(); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 28919a54b..4c134e72f 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -260,23 +260,36 @@ public final class CrawlStacker { // check if the url is double registered final String dbocc = nextQueue.urlExists(entry.url().hash()); // returns the name of the queue if entry exists - URIMetadataRow oldEntry = null; - if (dbocc != null || (oldEntry = indexSegment.urlMetadata().load(entry.url().hash(), null, 0)) != null) { - final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime()); - // do double-check - if ((dbocc != null) && (!recrawl)) { + URIMetadataRow oldEntry = indexSegment.urlMetadata().load(entry.url().hash(), null, 0); + if (oldEntry == null) { + if (dbocc != null) { + // do double-check if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); - return "double in: " + dbocc; + if (dbocc.equals("errors")) { + ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash()); + return "double in: errors (" + errorEntry.anycause() + ")"; + } else { + return "double in: " + dbocc; + } } - if ((oldEntry != null) && (!recrawl)) { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); - return "double in: LURL"; - } - - // show potential re-crawl - if (recrawl && oldEntry != null) { - if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " + + } else { + final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime(); + if (recrawl) { + if (this.log.isFine()) + this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " + ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago."); + } else { + if (dbocc == null) { + return "double in: LURL-DB"; + } else { + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (dbocc.equals("errors")) { + ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash()); + return "double in: errors (" + errorEntry.anycause() + ")"; + } else { + return "double in: " + dbocc; + } + } } } diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index 53c6367bc..ee256e9a7 100755 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -127,9 +127,10 @@ public class ZURL implements Iterable { final int workcount, String anycause) { assert executor != null; + if (exists(bentry.url().hash())) return; // don't insert double causes if (anycause == null) anycause = "unknown"; Entry entry = new Entry(bentry, executor, workdate, workcount, anycause); - entry.store(); + put(entry); stack.add(entry.hash()); while (stack.size() > maxStackSize) stack.poll(); } @@ -157,7 +158,7 @@ public class ZURL implements Iterable { } public ZURL.Entry next() { - return getEntry(hi.next()); + return get(hi.next()); } public void remove() { @@ -166,7 +167,7 @@ public class ZURL implements Iterable { } - public ZURL.Entry getEntry(final String urlhash) { + public ZURL.Entry get(final String urlhash) { try { if (urlIndex == null) return null; //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash); @@ -179,6 +180,29 @@ public class ZURL implements Iterable { } } + /** + * private put (use push instead) + * @param entry + */ + private void put(Entry entry) { + // stores the values from the object variables into the database + if (entry.stored) return; + if (entry.bentry == null) return; + final Row.Entry newrow = rowdef.newEntry(); + newrow.setCol(0, entry.bentry.url().hash().getBytes()); + newrow.setCol(1, entry.executor.getBytes()); + newrow.setCol(2, entry.workdate.getTime()); + newrow.setCol(3, entry.workcount); + newrow.setCol(4, entry.anycause.getBytes()); + newrow.setCol(5, entry.bentry.toRow().bytes()); + try { + if (urlIndex != null) urlIndex.put(newrow); + entry.stored = true; + } catch (final Exception e) { + Log.logException(e); + } + } + public boolean exists(final String urlHash) { return urlIndex.has(urlHash.getBytes()); } @@ -228,26 +252,6 @@ public class ZURL implements Iterable { this.stored = true; return; } - - protected void store() { - // stores the values from the object variables into the database - if (this.stored) return; - if (this.bentry == null) return; - final Row.Entry newrow = rowdef.newEntry(); - newrow.setCol(0, this.bentry.url().hash().getBytes()); - newrow.setCol(1, this.executor.getBytes()); - newrow.setCol(2, this.workdate.getTime()); - newrow.setCol(3, this.workcount); - newrow.setCol(4, this.anycause.getBytes()); - newrow.setCol(5, this.bentry.toRow().bytes()); - try { - //System.out.println("*** DEBUG ZURL " + urlIndex.filename() + " store " + newrow.getColString(0, "UTF-8")); - if (urlIndex != null) urlIndex.put(newrow); - this.stored = true; - } catch (final Exception e) { - Log.logException(e); - } - } public DigestURI url() { return this.bentry.url();