From 4d5b401f003281d793e32fc0c4c388c9dff25a83 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 9 Jan 2009 00:06:36 +0000 Subject: [PATCH] try to fix some performance problems with the internal index management: - ensuring that ordered indexes stay ordered during remove - no unnecessary ordering checks - better test logic in crawl stacker git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5457 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/CrawlStacker.java | 38 ++++++++++--------- .../index/indexRepositoryReference.java | 2 +- .../de/anomic/kelondro/kelondroRAMIndex.java | 7 +++- .../kelondro/kelondroRowCollection.java | 19 ++++++---- source/de/anomic/kelondro/kelondroRowSet.java | 4 +- 5 files changed, 39 insertions(+), 31 deletions(-) diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 87fd41ec4..bebe0f788 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -255,24 +255,26 @@ public final class CrawlStacker { // check if the url is double registered final String dbocc = nextQueue.urlExists(entry.url().hash()); - final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0); - final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime()); - // do double-check - if ((dbocc != null) && (!recrawl)) { - reason = "double " + dbocc + ")"; - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); - return reason; - } - if ((oldEntry != null) && (!recrawl)) { - reason = "double " + "LURL)"; - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); - return reason; - } - - // show potential re-crawl - if (recrawl && oldEntry != null) { - if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " + - ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago."); + if (dbocc != null || wordIndex.existsURL(entry.url().hash())) { + final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0); + final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime()); + // do double-check + if ((dbocc != null) && (!recrawl)) { + reason = "double " + dbocc + ")"; + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); + return reason; + } + if ((oldEntry != null) && (!recrawl)) { + reason = "double " + "LURL)"; + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); + return reason; + } + + // show potential re-crawl + if (recrawl && oldEntry != null) { + if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " + + ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago."); + } } // store information diff --git a/source/de/anomic/index/indexRepositoryReference.java b/source/de/anomic/index/indexRepositoryReference.java index 70a1a7861..005f9091e 100644 --- a/source/de/anomic/index/indexRepositoryReference.java +++ b/source/de/anomic/index/indexRepositoryReference.java @@ -146,7 +146,7 @@ public final class indexRepositoryReference { } } - public synchronized boolean exists(final String urlHash) { + public boolean exists(final String urlHash) { if (urlIndexFile == null) return false; // case may happen during shutdown return urlIndexFile.has(urlHash.getBytes()); } diff --git a/source/de/anomic/kelondro/kelondroRAMIndex.java b/source/de/anomic/kelondro/kelondroRAMIndex.java index 73a80440b..df560b660 100644 --- a/source/de/anomic/kelondro/kelondroRAMIndex.java +++ b/source/de/anomic/kelondro/kelondroRAMIndex.java @@ -69,6 +69,7 @@ public class kelondroRAMIndex implements kelondroIndex { public synchronized kelondroRow.Entry get(final byte[] key) { assert (key != null); finishInitialization(); + assert index0.isSorted(); final kelondroRow.Entry indexentry = index0.get(key); if (indexentry != null) return indexentry; return index1.get(key); @@ -77,6 +78,7 @@ public class kelondroRAMIndex implements kelondroIndex { public boolean has(final byte[] key) { assert (key != null); finishInitialization(); + assert index0.isSorted(); if (index0.has(key)) return true; return index1.has(key); } @@ -85,9 +87,10 @@ public class kelondroRAMIndex implements kelondroIndex { assert (entry != null); finishInitialization(); // if the new entry is within the initialization part, just overwrite it - final kelondroRow.Entry indexentry = index0.get(entry.getPrimaryKeyBytes()); + assert index0.isSorted(); + final kelondroRow.Entry indexentry = index0.remove(entry.getPrimaryKeyBytes()); // keeps ordering if (indexentry != null) { - index0.put(entry); + index1.put(entry); return indexentry; } // else place it in the index1 diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index ced82bc58..ff4e29933 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -403,9 +403,9 @@ public class kelondroRowCollection implements Iterable { return this.sortBound; } - public synchronized Iterator keys() { + public synchronized Iterator keys(boolean keepOrderWhenRemoving) { // iterates byte[] - type entries - return new keyIterator(); + return new keyIterator(keepOrderWhenRemoving); } /** @@ -417,9 +417,11 @@ public class kelondroRowCollection implements Iterable { public class keyIterator implements Iterator { private int p; + private boolean keepOrderWhenRemoving; - public keyIterator() { - p = 0; + public keyIterator(boolean keepOrderWhenRemoving) { + this.p = 0; + this.keepOrderWhenRemoving = keepOrderWhenRemoving; } public boolean hasNext() { @@ -432,7 +434,7 @@ public class kelondroRowCollection implements Iterable { public void remove() { p--; - removeRow(p, false); + removeRow(p, keepOrderWhenRemoving); } } @@ -446,9 +448,8 @@ public class kelondroRowCollection implements Iterable { /** * Iterator for kelondroRowCollection. - * It supports remove() though it doesn't contain the order of the underlying + * It supports remove() and keeps the order of the underlying * collection during removes. - * */ public class rowIterator implements Iterator { @@ -468,7 +469,7 @@ public class kelondroRowCollection implements Iterable { public void remove() { p--; - removeRow(p, false); + removeRow(p, true); } } @@ -828,6 +829,7 @@ public class kelondroRowCollection implements Iterable { assert (this.rowdef.objectOrder != null); if (chunkcount <= 1) return true; if (chunkcount != this.sortBound) return false; + /* for (int i = 0; i < chunkcount - 1; i++) { //System.out.println("*" + new String(get(i).getColBytes(0))); if (compare(i, i + 1) > 0) { @@ -835,6 +837,7 @@ public class kelondroRowCollection implements Iterable { return false; } } + */ return true; } diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index dfe3cae15..f5170598a 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -145,7 +145,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd final int index = find(a, start, length); if (index < 0) return null; final kelondroRow.Entry entry = super.get(index, true); - super.removeRow(index, true); + super.removeRow(index, true); // keep order of collection! int findagainindex = 0; assert (findagainindex = find(a, start, length)) < 0 : "remove: chunk found again at index position (after remove) " + findagainindex + ", index(before) = " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize * findagainindex, length) + ", searchkey=" + serverLog.arrayList(a, start, length); // check if the remove worked return entry; @@ -268,7 +268,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd public synchronized Iterator keys() { sort(); - return super.keys(); + return super.keys(true); } public synchronized kelondroCloneableIterator keys(final boolean up, final byte[] firstKey) {