From df7e1d9df372a0bdc7152d5e46e566b05e9d23e1 Mon Sep 17 00:00:00 2001 From: hermens Date: Wed, 17 May 2006 13:08:57 +0000 Subject: [PATCH] Changes to plasmaURL and subclasses: - Improve performance of plasmaURL.exists() by remembering URL-hashes that are not present - Use a more realistic estimation of memory usage by the existsIndex cache - Routine cleanup of the existsIndex to limit its memory usage git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2113 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/PerformanceMemory_p.java | 6 +- source/de/anomic/plasma/plasmaCrawlEURL.java | 6 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 112 +++++++++--------- source/de/anomic/plasma/plasmaCrawlNURL.java | 6 +- .../de/anomic/plasma/plasmaSwitchboard.java | 15 ++- source/de/anomic/plasma/plasmaURL.java | 24 ++-- 6 files changed, 102 insertions(+), 67 deletions(-) diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java index 3c01929df..5c19c9d63 100644 --- a/htroot/PerformanceMemory_p.java +++ b/htroot/PerformanceMemory_p.java @@ -278,13 +278,13 @@ public class PerformanceMemory_p { // other caching structures long amount = sb.urlPool.errorURL.existsIndexSize(); prop.put("eurl.existsIndexAmount",Long.toString(amount)); - prop.put("eurl.existsIndexSize",serverMemory.bytesToString(amount*plasmaURL.urlHashLength)); + prop.put("eurl.existsIndexSize",serverMemory.bytesToString(amount*(2*plasmaURL.urlHashLength+28))); amount = sb.urlPool.noticeURL.existsIndexSize(); prop.put("nurl.existsIndexAmount",Long.toString(amount)); - prop.put("nurl.existsIndexSize",serverMemory.bytesToString(amount*plasmaURL.urlHashLength)); + prop.put("nurl.existsIndexSize",serverMemory.bytesToString(amount*(2*plasmaURL.urlHashLength+28))); amount = sb.urlPool.loadedURL.existsIndexSize(); prop.put("lurl.existsIndexAmount",Long.toString(amount)); - prop.put("lurl.existsIndexSize",serverMemory.bytesToString(amount*plasmaURL.urlHashLength)); + prop.put("lurl.existsIndexSize",serverMemory.bytesToString(amount*(2*plasmaURL.urlHashLength+28))); // return rewrite values for templates return prop; diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index b325246ca..73f32a75e 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -45,6 +45,7 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.lang.Boolean; import java.net.URL; import java.util.Date; import java.util.Enumeration; @@ -206,7 +207,10 @@ public class plasmaCrawlEURL extends plasmaURL { this.failreason.getBytes(), this.flags.getBytes() }; - urlHashCache.put(entry); + synchronized(existsIndex) { + urlHashCache.put(entry); + existsIndex.put(this.hash, Boolean.TRUE); + } } catch (IOException e) { System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString()); } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index dab8bbd39..7512ea5b5 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -54,6 +54,7 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.lang.Boolean; import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; @@ -516,63 +517,66 @@ public final class plasmaCrawlLURL extends plasmaURL { public void store() { // Check if there is a more recent Entry already in the DB if (this.stored) return; - Entry oldEntry; - try { - if (exists(urlHash)) { - oldEntry = new Entry (urlHash, null); - } else { + synchronized(existsIndex) { + Entry oldEntry; + try { + if (exists(urlHash)) { + oldEntry = new Entry (urlHash, null); + } else { + oldEntry = null; + } + } catch (Exception e) { oldEntry = null; } - } catch (Exception e) { - oldEntry = null; - } - if ((oldEntry != null) && (isOlder(oldEntry))) { - // the fetched oldEntry is better, so return its properties instead of the new ones - // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same - // this.url = oldEntry.url; // unnecessary, should be the same - this.descr = oldEntry.descr; - this.moddate = oldEntry.moddate; - this.loaddate = oldEntry.loaddate; - this.referrerHash = oldEntry.referrerHash; - this.copyCount = oldEntry.copyCount; - this.flags = oldEntry.flags; - this.quality = oldEntry.quality; - this.language = oldEntry.language; - this.doctype = oldEntry.doctype; - this.size = oldEntry.size; - this.wordCount = oldEntry.wordCount; - // this.snippet // not read from db - // this.word // not read from db - return; - } - - // stores the values from the object variables into the database - final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength); - final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); + if ((oldEntry != null) && (isOlder(oldEntry))) { + // the fetched oldEntry is better, so return its properties instead of the new ones + // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same + // this.url = oldEntry.url; // unnecessary, should be the same + this.descr = oldEntry.descr; + this.moddate = oldEntry.moddate; + this.loaddate = oldEntry.loaddate; + this.referrerHash = oldEntry.referrerHash; + this.copyCount = oldEntry.copyCount; + this.flags = oldEntry.flags; + this.quality = oldEntry.quality; + this.language = oldEntry.language; + this.doctype = oldEntry.doctype; + this.size = oldEntry.size; + this.wordCount = oldEntry.wordCount; + // this.snippet // not read from db + // this.word // not read from db + return; + } - // store the hash in the hash cache - try { - // even if the entry exists, we simply overwrite it - final byte[][] entry = new byte[][] { - urlHash.getBytes(), - url.toString().getBytes(), - descr.getBytes(), // null? - moddatestr.getBytes(), - loaddatestr.getBytes(), - referrerHash.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(), - flags.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(), - language.getBytes(), - new byte[] {(byte) doctype}, - kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(), - }; - urlHashCache.put(entry); - serverLog.logFine("PLASMA","STORED new LURL " + url.toString()); - this.stored = true; - } catch (Exception e) { - serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e); + // stores the values from the object variables into the database + final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength); + final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); + + // store the hash in the hash cache + try { + // even if the entry exists, we simply overwrite it + final byte[][] entry = new byte[][] { + urlHash.getBytes(), + url.toString().getBytes(), + descr.getBytes(), // null? + moddatestr.getBytes(), + loaddatestr.getBytes(), + referrerHash.getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(), + flags.getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(), + language.getBytes(), + new byte[] {(byte) doctype}, + kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(), + }; + urlHashCache.put(entry); + serverLog.logFine("PLASMA","STORED new LURL " + url.toString()); + this.stored = true; + existsIndex.put(urlHash, Boolean.TRUE); + } catch (Exception e) { + serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e); + } } } diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 1c325bd82..9dfcce647 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -45,6 +45,7 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.lang.Boolean; import java.net.URL; import java.util.ArrayList; import java.util.Date; @@ -559,7 +560,10 @@ public class plasmaCrawlNURL extends plasmaURL { this.flags.getBytes(), normalizeHandle(this.handle).getBytes() }; - urlHashCache.put(entry); + synchronized(existsIndex) { + urlHashCache.put(entry); + existsIndex.put(this.hash, Boolean.TRUE); + } } catch (IOException e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB"); e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 6204be613..f01b33d6d 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1023,7 +1023,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // clean up profiles if (cleanProfiles()) hasDoneSomething = true; - + // clean up existsIndex + if (urlPool.errorURL.existsIndexSize() > 10000) { + log.logFine("Cleaning Error-URLs exists index, " + urlPool.errorURL.existsIndexSize() + " entries in index"); + urlPool.errorURL.clearExistsIndex(); + } + if (urlPool.noticeURL.existsIndexSize() > 10000) { + log.logFine("Cleaning Notice-URLs exists index, " + urlPool.noticeURL.existsIndexSize() + " entries in index"); + urlPool.noticeURL.clearExistsIndex(); + } + if (urlPool.loadedURL.existsIndexSize() > 100000) { + log.logFine("Cleaning Loaded-URLs exists index, " + urlPool.loadedURL.existsIndexSize() + " entries in index"); + urlPool.loadedURL.clearExistsIndex(); + } + // clean up news try { log.logFine("Cleaning Incoming News, " + yacyCore.newsPool.size(yacyNewsPool.INCOMING_DB) + " entries on stack"); diff --git a/source/de/anomic/plasma/plasmaURL.java b/source/de/anomic/plasma/plasmaURL.java index 2a27b73ec..c60a9d2be 100644 --- a/source/de/anomic/plasma/plasmaURL.java +++ b/source/de/anomic/plasma/plasmaURL.java @@ -42,10 +42,10 @@ package de.anomic.plasma; import java.io.IOException; +import java.lang.Boolean; import java.net.URL; import java.net.MalformedURLException; import java.text.SimpleDateFormat; -import java.util.HashSet; import java.util.HashMap; import java.util.Iterator; @@ -424,11 +424,11 @@ public class plasmaURL { // the class object public kelondroTree urlHashCache; - private final HashSet existsIndex; + public final HashMap existsIndex; // allow subclasses to access the existsIndex during Entry.store() public plasmaURL() { urlHashCache = null; - existsIndex = new HashSet(); + existsIndex = new HashMap(); } public int size() { @@ -441,12 +441,14 @@ public class plasmaURL { public boolean exists(String urlHash) { synchronized (existsIndex) { - if (existsIndex.contains(urlHash)) return true; + Boolean existsInIndex = (Boolean) existsIndex.get(urlHash); + if (existsInIndex != null) return existsInIndex.booleanValue(); try { if (urlHashCache.get(urlHash.getBytes()) != null) { - existsIndex.add(urlHash); + existsIndex.put(urlHash, Boolean.TRUE); return true; } else { + existsIndex.put(urlHash, Boolean.FALSE); return false; } } catch (IOException e) { @@ -462,15 +464,23 @@ public class plasmaURL { public boolean remove(String urlHash) { synchronized (existsIndex) { try { - boolean existsInIndex = this.existsIndex.remove(urlHash); + Boolean existsInIndex = (Boolean) existsIndex.remove(urlHash); + if (existsInIndex == null) existsInIndex = Boolean.FALSE; boolean existsInCache = (this.urlHashCache.remove(urlHash.getBytes()) != null); - return existsInIndex || existsInCache; + existsIndex.put(urlHash, Boolean.FALSE); + return existsInIndex.booleanValue() || existsInCache; } catch (IOException e) { return false; } } } + public void clearExistsIndex() { + synchronized (existsIndex) { + existsIndex.clear(); + } + } + public static final int flagTypeID(String hash) { return (kelondroBase64Order.enhancedCoder.decodeByte(hash.charAt(11)) & 32) >> 5; }