Changes to plasmaURL and subclasses:

- Improve performance of plasmaURL.exists() by also caching negative lookups (URL hashes that are not present)
- Use a more realistic estimate of the memory used by the existsIndex cache
- Routinely clean up the existsIndex to limit its memory usage



git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2113 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
hermens 19 years ago
parent 70ef53a89d
commit df7e1d9df3

@ -278,13 +278,13 @@ public class PerformanceMemory_p {
// other caching structures
long amount = sb.urlPool.errorURL.existsIndexSize();
prop.put("eurl.existsIndexAmount",Long.toString(amount));
prop.put("eurl.existsIndexSize",serverMemory.bytesToString(amount*plasmaURL.urlHashLength));
prop.put("eurl.existsIndexSize",serverMemory.bytesToString(amount*(2*plasmaURL.urlHashLength+28)));
amount = sb.urlPool.noticeURL.existsIndexSize();
prop.put("nurl.existsIndexAmount",Long.toString(amount));
prop.put("nurl.existsIndexSize",serverMemory.bytesToString(amount*plasmaURL.urlHashLength));
prop.put("nurl.existsIndexSize",serverMemory.bytesToString(amount*(2*plasmaURL.urlHashLength+28)));
amount = sb.urlPool.loadedURL.existsIndexSize();
prop.put("lurl.existsIndexAmount",Long.toString(amount));
prop.put("lurl.existsIndexSize",serverMemory.bytesToString(amount*plasmaURL.urlHashLength));
prop.put("lurl.existsIndexSize",serverMemory.bytesToString(amount*(2*plasmaURL.urlHashLength+28)));
// return rewrite values for templates
return prop;

@ -45,6 +45,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.lang.Boolean;
import java.net.URL;
import java.util.Date;
import java.util.Enumeration;
@ -206,7 +207,10 @@ public class plasmaCrawlEURL extends plasmaURL {
this.failreason.getBytes(),
this.flags.getBytes()
};
urlHashCache.put(entry);
synchronized(existsIndex) {
urlHashCache.put(entry);
existsIndex.put(this.hash, Boolean.TRUE);
}
} catch (IOException e) {
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
}

@ -54,6 +54,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.lang.Boolean;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
@ -516,63 +517,66 @@ public final class plasmaCrawlLURL extends plasmaURL {
/**
 * Writes this entry's fields to urlHashCache, unless it was already stored or the
 * database already holds a better entry for the same urlHash.
 *
 * If an existing entry is found and isOlder(oldEntry) holds, nothing is written;
 * this object's fields are overwritten with the existing entry's values instead.
 * Exceptions during the lookup are treated as "no existing entry"; exceptions during
 * the write are logged via serverLog.logSevere and not rethrown.
 *
 * NOTE(review): this listing interleaves the pre-change body with the post-change
 * body (the latter is wrapped in synchronized(existsIndex) and records
 * existsIndex.put(urlHash, Boolean.TRUE) after a successful write) -- read the two
 * versions separately.
 */
public void store() {
// Check if there is a more recent Entry already in the DB
if (this.stored) return;
Entry oldEntry;
try {
if (exists(urlHash)) {
oldEntry = new Entry (urlHash, null);
} else {
// the lookup, the write and the existsIndex update all happen under the existsIndex lock
synchronized(existsIndex) {
Entry oldEntry;
try {
if (exists(urlHash)) {
oldEntry = new Entry (urlHash, null);
} else {
oldEntry = null;
}
} catch (Exception e) {
// any failure while loading the existing entry is handled as "no existing entry"
oldEntry = null;
}
} catch (Exception e) {
oldEntry = null;
}
if ((oldEntry != null) && (isOlder(oldEntry))) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
this.descr = oldEntry.descr;
this.moddate = oldEntry.moddate;
this.loaddate = oldEntry.loaddate;
this.referrerHash = oldEntry.referrerHash;
this.copyCount = oldEntry.copyCount;
this.flags = oldEntry.flags;
this.quality = oldEntry.quality;
this.language = oldEntry.language;
this.doctype = oldEntry.doctype;
this.size = oldEntry.size;
this.wordCount = oldEntry.wordCount;
// this.snippet // not read from db
// this.word // not read from db
return;
}
// stores the values from the object variables into the database
// dates are encoded as whole days: getTime() is divided by 86400000 (milliseconds per day)
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
if ((oldEntry != null) && (isOlder(oldEntry))) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
this.descr = oldEntry.descr;
this.moddate = oldEntry.moddate;
this.loaddate = oldEntry.loaddate;
this.referrerHash = oldEntry.referrerHash;
this.copyCount = oldEntry.copyCount;
this.flags = oldEntry.flags;
this.quality = oldEntry.quality;
this.language = oldEntry.language;
this.doctype = oldEntry.doctype;
this.size = oldEntry.size;
this.wordCount = oldEntry.wordCount;
// this.snippet // not read from db
// this.word // not read from db
return;
}
// store the hash in the hash cache
try {
// even if the entry exists, we simply overwrite it
final byte[][] entry = new byte[][] {
urlHash.getBytes(),
url.toString().getBytes(),
descr.getBytes(), // null?
moddatestr.getBytes(),
loaddatestr.getBytes(),
referrerHash.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(),
flags.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(),
language.getBytes(),
new byte[] {(byte) doctype},
kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
};
urlHashCache.put(entry);
serverLog.logFine("PLASMA","STORED new LURL " + url.toString());
this.stored = true;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e);
// stores the values from the object variables into the database
// dates are encoded as whole days: getTime() is divided by 86400000 (milliseconds per day)
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
// store the hash in the hash cache
try {
// even if the entry exists, we simply overwrite it
final byte[][] entry = new byte[][] {
urlHash.getBytes(),
url.toString().getBytes(),
descr.getBytes(), // null?
moddatestr.getBytes(),
loaddatestr.getBytes(),
referrerHash.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(),
flags.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(),
language.getBytes(),
new byte[] {(byte) doctype},
kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
};
urlHashCache.put(entry);
serverLog.logFine("PLASMA","STORED new LURL " + url.toString());
this.stored = true;
// remember the hash as present only after the write went through
existsIndex.put(urlHash, Boolean.TRUE);
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e);
}
}
}

@ -45,6 +45,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.lang.Boolean;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
@ -559,7 +560,10 @@ public class plasmaCrawlNURL extends plasmaURL {
this.flags.getBytes(),
normalizeHandle(this.handle).getBytes()
};
urlHashCache.put(entry);
synchronized(existsIndex) {
urlHashCache.put(entry);
existsIndex.put(this.hash, Boolean.TRUE);
}
} catch (IOException e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
e.printStackTrace();

@ -1023,7 +1023,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// clean up profiles
if (cleanProfiles()) hasDoneSomething = true;
// clean up existsIndex
if (urlPool.errorURL.existsIndexSize() > 10000) {
log.logFine("Cleaning Error-URLs exists index, " + urlPool.errorURL.existsIndexSize() + " entries in index");
urlPool.errorURL.clearExistsIndex();
}
if (urlPool.noticeURL.existsIndexSize() > 10000) {
log.logFine("Cleaning Notice-URLs exists index, " + urlPool.noticeURL.existsIndexSize() + " entries in index");
urlPool.noticeURL.clearExistsIndex();
}
if (urlPool.loadedURL.existsIndexSize() > 100000) {
log.logFine("Cleaning Loaded-URLs exists index, " + urlPool.loadedURL.existsIndexSize() + " entries in index");
urlPool.loadedURL.clearExistsIndex();
}
// clean up news
try {
log.logFine("Cleaning Incoming News, " + yacyCore.newsPool.size(yacyNewsPool.INCOMING_DB) + " entries on stack");

@ -42,10 +42,10 @@
package de.anomic.plasma;
import java.io.IOException;
import java.lang.Boolean;
import java.net.URL;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Iterator;
@ -424,11 +424,11 @@ public class plasmaURL {
// the class object
// backing store for URL entries; the constructor leaves it null
// (presumably assigned by subclasses -- confirm against callers)
public kelondroTree urlHashCache;
// NOTE(review): this listing shows the earlier HashSet declaration followed by the
// HashMap declaration that supersedes it; the same applies to the two assignments
// in the constructor below.
private final HashSet existsIndex;
// in-memory lookup cache keyed by URL hash; values are Boolean.TRUE / Boolean.FALSE
public final HashMap existsIndex; // allow subclasses to access the existsIndex during Entry.store()
/**
 * Creates an instance with no backing urlHashCache and an empty existsIndex.
 */
public plasmaURL() {
urlHashCache = null;
existsIndex = new HashSet();
existsIndex = new HashMap();
}
public int size() {
@ -441,12 +441,14 @@ public class plasmaURL {
public boolean exists(String urlHash) {
synchronized (existsIndex) {
if (existsIndex.contains(urlHash)) return true;
Boolean existsInIndex = (Boolean) existsIndex.get(urlHash);
if (existsInIndex != null) return existsInIndex.booleanValue();
try {
if (urlHashCache.get(urlHash.getBytes()) != null) {
existsIndex.add(urlHash);
existsIndex.put(urlHash, Boolean.TRUE);
return true;
} else {
existsIndex.put(urlHash, Boolean.FALSE);
return false;
}
} catch (IOException e) {
@ -462,15 +464,23 @@ public class plasmaURL {
/**
 * Removes the given URL hash from both the existsIndex and the urlHashCache.
 *
 * After removal the hash is recorded in existsIndex as Boolean.FALSE, so it stays
 * cached as "not present". The whole operation runs while holding the existsIndex lock.
 *
 * NOTE(review): this listing shows the earlier boolean-based lines next to the
 * Boolean/HashMap-based lines that supersede them (two declarations of
 * existsInIndex, two return statements).
 *
 * @param urlHash the URL hash to remove
 * @return true if existsIndex held Boolean.TRUE for the hash or urlHashCache
 *         contained a record for it; false otherwise, and false on IOException
 */
public boolean remove(String urlHash) {
synchronized (existsIndex) {
try {
boolean existsInIndex = this.existsIndex.remove(urlHash);
Boolean existsInIndex = (Boolean) existsIndex.remove(urlHash);
// a hash that was never looked up has no index entry; treat it as "not present"
if (existsInIndex == null) existsInIndex = Boolean.FALSE;
boolean existsInCache = (this.urlHashCache.remove(urlHash.getBytes()) != null);
return existsInIndex || existsInCache;
// cache the negative result: the hash is now known to be absent
existsIndex.put(urlHash, Boolean.FALSE);
return existsInIndex.booleanValue() || existsInCache;
} catch (IOException e) {
// the index entry was already dropped above and no FALSE marker is written on this path
return false;
}
}
}
/**
 * Discards every entry of the existsIndex lookup cache.
 *
 * Only the in-memory index is emptied; urlHashCache is not touched. The clear
 * runs while holding the existsIndex lock.
 */
public void clearExistsIndex() {
    synchronized (this.existsIndex) {
        this.existsIndex.clear();
    }
}
/**
 * Reads the type flag stored in a hash.
 *
 * The character at index 11 of the hash is decoded with
 * kelondroBase64Order.enhancedCoder, and the bit with value 32 of the decoded
 * value is returned as 0 or 1.
 *
 * @param hash the hash to inspect; must have at least 12 characters
 * @return 1 if the bit with value 32 is set in the decoded character, otherwise 0
 */
public static final int flagTypeID(String hash) {
    final char flagChar = hash.charAt(11);
    final int decoded = kelondroBase64Order.enhancedCoder.decodeByte(flagChar);
    // isolate the bit with value 32 and report it as 0 or 1
    return ((decoded & 32) != 0) ? 1 : 0;
}

Loading…
Cancel
Save