Try to fix some performance problems in the internal index management:

- ensure that ordered indexes stay ordered during remove operations
- skip unnecessary ordering checks
- use better test logic in the crawl stacker

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5457 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 4641ecd6d9
commit 4d5b401f00

@ -255,24 +255,26 @@ public final class CrawlStacker {
// check if the url is double registered
final String dbocc = nextQueue.urlExists(entry.url().hash());
final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0);
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {
reason = "double " + dbocc + ")";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
if ((oldEntry != null) && (!recrawl)) {
reason = "double " + "LURL)";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
// show potential re-crawl
if (recrawl && oldEntry != null) {
if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
if (dbocc != null || wordIndex.existsURL(entry.url().hash())) {
final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0);
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {
reason = "double " + dbocc + ")";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
if ((oldEntry != null) && (!recrawl)) {
reason = "double " + "LURL)";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
// show potential re-crawl
if (recrawl && oldEntry != null) {
if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
}
}
// store information

@ -146,7 +146,7 @@ public final class indexRepositoryReference {
}
}
public synchronized boolean exists(final String urlHash) {
public boolean exists(final String urlHash) {
if (urlIndexFile == null) return false; // case may happen during shutdown
return urlIndexFile.has(urlHash.getBytes());
}

@ -69,6 +69,7 @@ public class kelondroRAMIndex implements kelondroIndex {
public synchronized kelondroRow.Entry get(final byte[] key) {
assert (key != null);
finishInitialization();
assert index0.isSorted();
final kelondroRow.Entry indexentry = index0.get(key);
if (indexentry != null) return indexentry;
return index1.get(key);
@ -77,6 +78,7 @@ public class kelondroRAMIndex implements kelondroIndex {
public boolean has(final byte[] key) {
assert (key != null);
finishInitialization();
assert index0.isSorted();
if (index0.has(key)) return true;
return index1.has(key);
}
@ -85,9 +87,10 @@ public class kelondroRAMIndex implements kelondroIndex {
assert (entry != null);
finishInitialization();
// if the new entry is within the initialization part, just overwrite it
final kelondroRow.Entry indexentry = index0.get(entry.getPrimaryKeyBytes());
assert index0.isSorted();
final kelondroRow.Entry indexentry = index0.remove(entry.getPrimaryKeyBytes()); // keeps ordering
if (indexentry != null) {
index0.put(entry);
index1.put(entry);
return indexentry;
}
// else place it in the index1

@ -403,9 +403,9 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
return this.sortBound;
}
public synchronized Iterator<byte[]> keys() {
public synchronized Iterator<byte[]> keys(boolean keepOrderWhenRemoving) {
// iterates byte[] - type entries
return new keyIterator();
return new keyIterator(keepOrderWhenRemoving);
}
/**
@ -417,9 +417,11 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
public class keyIterator implements Iterator<byte[]> {
private int p;
private boolean keepOrderWhenRemoving;
public keyIterator() {
p = 0;
public keyIterator(boolean keepOrderWhenRemoving) {
this.p = 0;
this.keepOrderWhenRemoving = keepOrderWhenRemoving;
}
public boolean hasNext() {
@ -432,7 +434,7 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
public void remove() {
p--;
removeRow(p, false);
removeRow(p, keepOrderWhenRemoving);
}
}
@ -446,9 +448,8 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
/**
* Iterator for kelondroRowCollection.
* It supports remove() though it doesn't contain the order of the underlying
* It supports remove() and keeps the order of the underlying
* collection during removes.
*
*/
public class rowIterator implements Iterator<kelondroRow.Entry> {
@ -468,7 +469,7 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
public void remove() {
p--;
removeRow(p, false);
removeRow(p, true);
}
}
@ -828,6 +829,7 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
assert (this.rowdef.objectOrder != null);
if (chunkcount <= 1) return true;
if (chunkcount != this.sortBound) return false;
/*
for (int i = 0; i < chunkcount - 1; i++) {
//System.out.println("*" + new String(get(i).getColBytes(0)));
if (compare(i, i + 1) > 0) {
@ -835,6 +837,7 @@ public class kelondroRowCollection implements Iterable<kelondroRow.Entry> {
return false;
}
}
*/
return true;
}

@ -145,7 +145,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
final int index = find(a, start, length);
if (index < 0) return null;
final kelondroRow.Entry entry = super.get(index, true);
super.removeRow(index, true);
super.removeRow(index, true); // keep order of collection!
int findagainindex = 0;
assert (findagainindex = find(a, start, length)) < 0 : "remove: chunk found again at index position (after remove) " + findagainindex + ", index(before) = " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize * findagainindex, length) + ", searchkey=" + serverLog.arrayList(a, start, length); // check if the remove worked
return entry;
@ -268,7 +268,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
public synchronized Iterator<byte[]> keys() {
sort();
return super.keys();
return super.keys(true);
}
public synchronized kelondroCloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {

Loading…
Cancel
Save