From bcd99fe83eacb4de1c8e62c5ed616df8bf3271cc Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 13 Mar 2006 10:43:12 +0000 Subject: [PATCH] introduced a second RAM cache for DHT transfer git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1880 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/PerformanceQueues_p.html | 48 ++-- htroot/PerformanceQueues_p.java | 28 +- htroot/xml/status_p.java | 2 +- .../kelondro/kelondroCollectionIndex.java | 1 - .../kelondro/kelondroMScoreCluster.java | 32 +-- .../de/anomic/plasma/plasmaSearchEvent.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 5 +- source/de/anomic/plasma/plasmaWordIndex.java | 65 +++-- .../anomic/plasma/plasmaWordIndexCache.java | 253 ++++++++++++------ .../plasma/plasmaWordIndexInterface.java | 3 +- yacy.init | 17 +- 11 files changed, 272 insertions(+), 184 deletions(-) diff --git a/htroot/PerformanceQueues_p.html b/htroot/PerformanceQueues_p.html index cc15c3d05..e3a5df4ea 100644 --- a/htroot/PerformanceQueues_p.html +++ b/htroot/PerformanceQueues_p.html @@ -69,53 +69,59 @@ Changes take effect immediately
Indexing Cache Settings:
+ + + + + + - - + + + - + + - - + + + - - + + + - - + + + - - - - - diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index de43f806d..b2e8d1f34 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -142,15 +142,14 @@ public class PerformanceQueues_p { idlesleep = Long.parseLong(d((String) defaultSettings.get(threadName + "_idlesleep"), "1000")); busysleep = Long.parseLong(d((String) defaultSettings.get(threadName + "_busysleep"), "100")); memprereq = Long.parseLong(d((String) defaultSettings.get(threadName + "_memprereq"), "0")); - + // check values to prevent short-cut loops if (idlesleep < 1000) idlesleep = 1000; if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; } if ((threadName.equals("50_localcrawl")) && (busysleep < 100)) busysleep = 100; if ((threadName.equals("61_globalcrawltrigger")) && (busysleep < 100)) busysleep = 100; if ((threadName.equals("62_remotetriggeredcrawl")) && (busysleep < 100)) busysleep = 100; - - + // on-the-fly re-configuration switchboard.setThreadPerformance(threadName, idlesleep, busysleep, memprereq); switchboard.setConfig(threadName + "_idlesleep", idlesleep); @@ -171,12 +170,9 @@ public class PerformanceQueues_p { prop.put("table", c); if ((post != null) && (post.containsKey("cacheSizeSubmit"))) { - int wordCacheMaxLow = post.getInt("wordCacheMaxLow", 8000); - int wordCacheMaxHigh = post.getInt("wordCacheMaxHigh", 10000); - if (wordCacheMaxLow > wordCacheMaxHigh) wordCacheMaxLow = wordCacheMaxHigh; - switchboard.setConfig("wordCacheMaxLow", Integer.toString(wordCacheMaxLow)); - switchboard.setConfig("wordCacheMaxHigh", Integer.toString(wordCacheMaxHigh)); - switchboard.wordIndex.setMaxWords(wordCacheMaxLow, wordCacheMaxHigh); + int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 10000); + switchboard.setConfig("wordCacheMaxCount", Integer.toString(wordCacheMaxCount)); + switchboard.wordIndex.setMaxWordCount(wordCacheMaxCount); int maxWaitingWordFlush = post.getInt("maxWaitingWordFlush", 180); switchboard.setConfig("maxWaitingWordFlush", Integer.toString(maxWaitingWordFlush)); } @@ -251,13 +247,15 @@ public class PerformanceQueues_p { } // table cache settings - prop.put("wordCacheRAMSize", switchboard.wordIndex.wordCacheRAMSize()); - prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache()); - prop.put("maxAgeOfWordCache", "" + (switchboard.wordIndex.maxAgeOfWordCache() / 1000 / 60)); // minutes - prop.put("minAgeOfWordCache", "" + (switchboard.wordIndex.minAgeOfWordCache() / 1000 / 60)); // minutes + prop.put("wordCacheWSize", switchboard.wordIndex.wSize()); + prop.put("wordCacheKSize", switchboard.wordIndex.kSize()); + prop.put("maxURLinWCache", "" + switchboard.wordIndex.maxURLinWCache()); + prop.put("maxAgeOfWCache", "" + (switchboard.wordIndex.maxAgeOfWCache() / 1000 / 60)); // minutes + prop.put("minAgeOfWCache", "" + (switchboard.wordIndex.minAgeOfWCache() / 1000 / 60)); // minutes + prop.put("maxAgeOfKCache", "" + (switchboard.wordIndex.maxAgeOfKCache() / 1000 / 60)); // minutes + prop.put("minAgeOfKCache", "" + (switchboard.wordIndex.minAgeOfKCache() / 1000 / 60)); // minutes prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180")); - prop.put("wordCacheMaxLow", switchboard.getConfig("wordCacheMaxLow", "10000")); - prop.put("wordCacheMaxHigh", switchboard.getConfig("wordCacheMaxHigh", "10000")); + prop.put("wordCacheMaxCount", switchboard.getConfig("wordCacheMaxCount", "10000")); prop.put("onlineCautionDelay", switchboard.getConfig("onlineCautionDelay", "30000")); prop.put("onlineCautionDelayCurrent", System.currentTimeMillis() - switchboard.proxyLastAccess); diff --git a/htroot/xml/status_p.java b/htroot/xml/status_p.java index af9bc3d9e..3cd8c35e5 100644 --- a/htroot/xml/status_p.java +++ b/htroot/xml/status_p.java @@ -64,7 +64,7 @@ public class status_p { prop.put("rejected", 0); yacyCore.peerActions.updateMySeed(); prop.put("ppm", yacyCore.seedDB.mySeed.get(yacySeed.ISPEED, "unknown")); - prop.put("wordCacheSize", switchboard.wordIndex.wordCacheRAMSize()); + prop.put("wordCacheSize", switchboard.wordIndex.wSize() + switchboard.wordIndex.kSize()); prop.put("wordCacheMaxLow", switchboard.getConfig("wordCacheMaxLow", "10000")); prop.put("wordCacheMaxHigh", switchboard.getConfig("wordCacheMaxHigh", "10000")); diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index 7084616a9..85e711d95 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -27,7 +27,6 @@ package de.anomic.kelondro; import java.io.File; import java.io.IOException; -import java.util.Iterator; public class kelondroCollectionIndex { diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java index e67d61e89..d32c92dfc 100644 --- a/source/de/anomic/kelondro/kelondroMScoreCluster.java +++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java @@ -73,17 +73,9 @@ public final class kelondroMScoreCluster { } catch (ParseException e) {} } - /* - public static int string2score(String s) { - int i = string2scoreX(s); - System.out.println("string2core(" + s + ") = " + i); - return i; - } - */ - public static int string2score(String s) { // this can be used to calculate a score from a string - + if ((s == null) || (s.length() == 0) || (s.charAt(0) == '-')) return 0; try { long l = 0; if (s.length() == shortDateFormatString.length()) { @@ -97,7 +89,10 @@ public final class kelondroMScoreCluster { } // fix out-of-ranges if (l > Integer.MAX_VALUE) return Integer.MAX_VALUE; - if (l < 0) return 0; + if (l < 0) { + System.out.println("string2score: negative score for input " + s); + return 0; + } return (int) l; } catch (Exception e) { // try it lex @@ -110,7 +105,10 @@ public final class kelondroMScoreCluster { } for (int i = len; i < 5; i++) c <<= 6; if (c > Integer.MAX_VALUE) return Integer.MAX_VALUE; - if (c < 0) return 0; + if (c < 0) { + System.out.println("string2score: negative score for input " + s); + return 0; + } return c; } } @@ -411,14 +409,18 @@ public final class kelondroMScoreCluster { public static void main(String[] args) { - if (args.length > 0) System.out.println("score of " + args[0] + ": " + string2score(args[0])); - //System.exit(0); + String t = "ZZZZZZZZZZ"; + System.out.println("score of " + t + ": " + string2score(t)); + if (args.length > 0) { + System.out.println("score of " + args[0] + ": " + string2score(args[0])); + System.exit(0); + } System.out.println("Test for Score: start"); kelondroMScoreCluster s = new kelondroMScoreCluster(); - long c = 0; + long c = 0; - // create cluster + // create cluster long time = System.currentTimeMillis(); Random random = new Random(1234); int r; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index adbd10c5a..ed2c143fb 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -304,7 +304,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { while (hashi.hasNext()) { wordHash = (String) hashi.next(); rcGlobal.setWordHash(wordHash); - wordIndex.addEntries(rcGlobal, System.currentTimeMillis(), true); + wordIndex.addEntries(rcGlobal, System.currentTimeMillis(), false); log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries"); } // the rcGlobal was flushed, empty it diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index c0fbf32f1..d5e6c7b64 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -369,9 +369,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser urlPool = new plasmaURLPool(plasmaPath, ramLURL, ramNURL, ramEURL); wordIndex = new plasmaWordIndex(plasmaPath, ramRWI, log); - int wordCacheMaxLow = (int) getConfigLong("wordCacheMaxLow", 8000); - int wordCacheMaxHigh = (int) getConfigLong("wordCacheMaxHigh", 10000); - wordIndex.setMaxWords(wordCacheMaxLow, wordCacheMaxHigh); + int wordCacheMaxCount = (int) getConfigLong("wordCacheMaxCount", 10000); + wordIndex.setMaxWordCount(wordCacheMaxCount); // start a cache manager log.logConfig("Starting HT Cache Manager"); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 47738e91b..2a0ae6d2c 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -90,20 +90,32 @@ public final class plasmaWordIndex { return databaseRoot; } - public int maxURLinWordCache() { - return ramCache.maxURLinWordCache(); + public int maxURLinWCache() { + return ramCache.maxURLinWCache(); } - public long minAgeOfWordCache() { - return ramCache.minAgeOfWordCache(); + public long minAgeOfWCache() { + return ramCache.minAgeOfWCache(); } - public long maxAgeOfWordCache() { - return ramCache.maxAgeOfWordCache(); + public long maxAgeOfWCache() { + return ramCache.maxAgeOfWCache(); } - public int wordCacheRAMSize() { - return ramCache.wordCacheRAMSize(); + public long minAgeOfKCache() { + return ramCache.minAgeOfKCache(); + } + + public long maxAgeOfKCache() { + return ramCache.maxAgeOfKCache(); + } + + public int wSize() { + return ramCache.wSize(); + } + + public int kSize() { + return ramCache.kSize(); } public int[] assortmentsSizes() { @@ -118,48 +130,49 @@ public final class plasmaWordIndex { return assortmentCluster.cacheFillStatusCml(); } - public void setMaxWords(int maxWordsLow, int maxWordsHigh) { - ramCache.setMaxWords(maxWordsLow, maxWordsHigh); + public void setMaxWordCount(int maxWords) { + ramCache.setMaxWordCount(maxWords); } - public void flushControl(boolean highPriority) { + public void flushControl(boolean dhtCase) { // check for forced flush - if (highPriority) { - if (ramCache.size() > ramCache.getMaxWordsHigh()) { - while (ramCache.size() + 500 > ramCache.getMaxWordsHigh()) { + ramCache.shiftK2W(); + if (dhtCase) { + if (ramCache.wSize() > ramCache.getMaxWordCount()) { + while (ramCache.wSize() + 500 > ramCache.getMaxWordCount()) { flushCache(1); } } } else { - while (ramCache.maxURLinWordCache() > plasmaWordIndexCache.ramCacheReferenceLimit) { + while (ramCache.maxURLinWCache() > plasmaWordIndexCache.wCacheReferenceLimit) { flushCache(1); } - if (ramCache.size() > ramCache.getMaxWordsLow()) { - while (ramCache.size() + 500 > ramCache.getMaxWordsLow()) { + if (ramCache.wSize() > ramCache.getMaxWordCount()) { + while (ramCache.wSize() + 500 > ramCache.getMaxWordCount()) { flushCache(1); } } } } - public boolean addEntry(String wordHash, plasmaWordIndexEntry entry, long updateTime, boolean highPriority) { - if (ramCache.addEntry(wordHash, entry, updateTime)) { - flushControl(highPriority); + public boolean addEntry(String wordHash, plasmaWordIndexEntry entry, long updateTime, boolean dhtCase) { + if (ramCache.addEntry(wordHash, entry, updateTime, dhtCase)) { + flushControl(dhtCase); return true; } return false; } - public int addEntries(plasmaWordIndexEntryContainer entries, long updateTime, boolean highPriority) { - int added = ramCache.addEntries(entries, updateTime, highPriority); + public int addEntries(plasmaWordIndexEntryContainer entries, long updateTime, boolean dhtCase) { + int added = ramCache.addEntries(entries, updateTime, dhtCase); // force flush - flushControl(highPriority); + flushControl(dhtCase); return added; } public synchronized void flushCacheSome() { - int flushCount = ramCache.size() / 1000; + int flushCount = ramCache.wSize() / 1000; if (flushCount > 50) flushCount = 50; if (flushCount < 3) flushCount = 3; flushCache(flushCount); @@ -167,7 +180,7 @@ public final class plasmaWordIndex { public synchronized void flushCache(int count) { for (int i = 0; i < count; i++) { - if (ramCache.size() == 0) break; + if (ramCache.wSize() == 0) break; flushCache(ramCache.bestFlushWordHash()); try {Thread.sleep(10);} catch (InterruptedException e) {} } @@ -316,7 +329,7 @@ public final class plasmaWordIndex { public int size() { return java.lang.Math.max(assortmentCluster.sizeTotal(), - java.lang.Math.max(backend.size(), ramCache.size())); + java.lang.Math.max(backend.size(), ramCache.wSize() + ramCache.kSize())); } public int indexSize(String wordHash) { diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index f85b1104e..e228f0f95 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -56,21 +56,24 @@ import de.anomic.kelondro.kelondroRecords; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; -public final class plasmaWordIndexCache implements plasmaWordIndexInterface { +public final class plasmaWordIndexCache /*implements plasmaWordIndexInterface*/ { // environment constants private static final String indexArrayFileName = "indexDump1.array"; - public static final int ramCacheReferenceLimit = 50; - public static final long ramCacheMaxAge = 1000 * 60 * 60 * 2; // milliseconds; 2 Hours - public static final long ramCacheMinAge = 1000 * 60 * 2; // milliseconds; 2 Minutes (Karenz for DHT Receive) + public static final int wCacheReferenceLimit = 50; + public static final long wCacheMaxAge = 1000 * 60 * 60 * 2; // milliseconds; 2 hours + public static final long wCacheMinAge = 1000; // milliseconds; 1 second + public static final long kCacheMaxAge = 1000 * 60 * 2; // milliseconds; 2 minutes // class variables private final File databaseRoot; - private final TreeMap cache; + private final TreeMap wCache; // wordhash-container + private final TreeMap kCache; // time-container; for karenz/DHT caching (set with high priority) private final kelondroMScoreCluster hashScore; private final kelondroMScoreCluster hashDate; + private long kCacheInc = 0; private long startTime; - private int maxWordsLow, maxWordsHigh; // we have 2 cache limits for different priorities + private int wCacheMaxCount; private final serverLog log; // calculated constants @@ -85,12 +88,13 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { // creates a new index cache // the cache has a back-end where indexes that do not fit in the cache are flushed this.databaseRoot = databaseRoot; - this.cache = new TreeMap(); + this.wCache = new TreeMap(); + this.kCache = new TreeMap(); this.hashScore = new kelondroMScoreCluster(); this.hashDate = new kelondroMScoreCluster(); + this.kCacheInc = 0; this.startTime = System.currentTimeMillis(); - this.maxWordsLow = 8000; - this.maxWordsHigh = 10000; + this.wCacheMaxCount = 10000; this.log = log; // read in dump of last session @@ -102,7 +106,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { } private void dump(int waitingSeconds) throws IOException { - log.logConfig("creating dump for index cache, " + cache.size() + " words (and much more urls)"); + log.logConfig("creating dump for index cache, " + wCache.size() + " words (and much more urls)"); File indexDumpFile = new File(databaseRoot, indexArrayFileName); if (indexDumpFile.exists()) indexDumpFile.delete(); kelondroArray dumpArray = null; @@ -110,14 +114,41 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { long startTime = System.currentTimeMillis(); long messageTime = System.currentTimeMillis() + 5000; long wordsPerSecond = 0, wordcount = 0, urlcount = 0; - synchronized (cache) { - Iterator i = cache.entrySet().iterator(); - Map.Entry entry; - String wordHash; - plasmaWordIndexEntryContainer container; - long updateTime; - plasmaWordIndexEntry wordEntry; - byte[][] row = new byte[5][]; + Map.Entry entry; + String wordHash; + plasmaWordIndexEntryContainer container; + long updateTime; + plasmaWordIndexEntry wordEntry; + byte[][] row = new byte[5][]; + + // write kCache, this will be melted with the wCache upon load + synchronized (kCache) { + Iterator i = kCache.values().iterator(); + while (i.hasNext()) { + container = (plasmaWordIndexEntryContainer) i.next(); + + // put entries on stack + if (container != null) { + Iterator ci = container.entries(); + while (ci.hasNext()) { + wordEntry = (plasmaWordIndexEntry) ci.next(); + row[0] = container.wordHash().getBytes(); + row[1] = kelondroRecords.long2bytes(container.size(), 4); + row[2] = kelondroRecords.long2bytes(container.updated(), 8); + row[3] = wordEntry.getUrlHash().getBytes(); + row[4] = wordEntry.toEncodedForm().getBytes(); + dumpArray.set((int) urlcount++, row); + } + } + wordcount++; + i.remove(); // free some mem + + } + } + + // write wCache + synchronized (wCache) { + Iterator i = wCache.entrySet().iterator(); while (i.hasNext()) { // get entries entry = (Map.Entry) i.next(); @@ -145,7 +176,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if (System.currentTimeMillis() > messageTime) { // System.gc(); // for better statistic wordsPerSecond = wordcount * 1000 / (1 + System.currentTimeMillis() - startTime); - log.logInfo("dumping status: " + wordcount + " words done, " + (cache.size() / (wordsPerSecond + 1)) + " seconds remaining, free mem = " + (Runtime.getRuntime().freeMemory() / 1024 / 1024) + "MB"); + log.logInfo("dumping status: " + wordcount + " words done, " + (wCache.size() / (wordsPerSecond + 1)) + " seconds remaining, free mem = " + (Runtime.getRuntime().freeMemory() / 1024 / 1024) + "MB"); messageTime = System.currentTimeMillis() + 5000; } } @@ -164,7 +195,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { long messageTime = System.currentTimeMillis() + 5000; long urlCount = 0, urlsPerSecond = 0; try { - synchronized (cache) { + synchronized (wCache) { int i = dumpArray.size(); String wordHash; //long creationTime; @@ -179,7 +210,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { //creationTime = kelondroRecords.bytes2long(row[2]); wordEntry = new plasmaWordIndexEntry(new String(row[3], "UTF-8"), new String(row[4], "UTF-8")); // store to cache - addEntry(wordHash, wordEntry, startTime); + addEntry(wordHash, wordEntry, startTime, false); urlCount++; // protect against memory shortage //while (rt.freeMemory() < 1000000) {flushFromMem(); java.lang.System.gc();} @@ -194,7 +225,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { } dumpArray.close(); - log.logConfig("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); + log.logConfig("restored " + wCache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); } catch (kelondroException e) { // restore failed log.logSevere("restore of indexCache array dump failed: " + e.getMessage(), e); @@ -206,72 +237,94 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { // cache settings - public int maxURLinWordCache() { + public int maxURLinWCache() { if (hashScore.size() == 0) return 0; return hashScore.getMaxScore(); } - public long minAgeOfWordCache() { + public long minAgeOfWCache() { if (hashDate.size() == 0) return 0; return System.currentTimeMillis() - longEmit(hashDate.getMaxScore()); } - public long maxAgeOfWordCache() { + public long maxAgeOfWCache() { if (hashDate.size() == 0) return 0; return System.currentTimeMillis() - longEmit(hashDate.getMinScore()); } - public int wordCacheRAMSize() { - return cache.size(); + public long minAgeOfKCache() { + if (kCache.size() == 0) return 0; + return System.currentTimeMillis() - ((Long) kCache.lastKey()).longValue(); } - public void setMaxWords(int maxWordsLow, int maxWordsHigh) { - this.maxWordsLow = maxWordsLow; - this.maxWordsHigh = maxWordsHigh; - } - - public int getMaxWordsLow() { - return this.maxWordsLow; + public long maxAgeOfKCache() { + if (kCache.size() == 0) return 0; + return System.currentTimeMillis() - ((Long) kCache.firstKey()).longValue(); } - public int getMaxWordsHigh() { - return this.maxWordsHigh; + public void setMaxWordCount(int maxWords) { + this.wCacheMaxCount = maxWords; + } + + public int getMaxWordCount() { + return this.wCacheMaxCount; } - public int size() { - return cache.size(); + public int wSize() { + return wCache.size(); + } + + public int kSize() { + return kCache.size(); } public int indexSize(String wordHash) { int size = 0; - plasmaWordIndexEntryContainer cacheIndex = (plasmaWordIndexEntryContainer) cache.get(wordHash); + plasmaWordIndexEntryContainer cacheIndex = (plasmaWordIndexEntryContainer) wCache.get(wordHash); if (cacheIndex != null) size += cacheIndex.size(); return size; } public Iterator wordHashes(String startWordHash, boolean rot) { if (rot) throw new UnsupportedOperationException("plasmaWordIndexCache cannot rotate"); - return cache.tailMap(startWordHash).keySet().iterator(); + return wCache.tailMap(startWordHash).keySet().iterator(); } + public void shiftK2W() { + // find entries in kCache that are too old for that place and shift them to the wCache + long time; + Long l; + plasmaWordIndexEntryContainer container; + synchronized (kCache) { + while (kCache.size() > 0) { + l = (Long) kCache.firstKey(); + time = l.longValue(); + if (System.currentTimeMillis() - time < kCacheMaxAge) return; + container = (plasmaWordIndexEntryContainer) kCache.remove(l); + addEntries(container, container.updated(), false); + } + } + } + public String bestFlushWordHash() { // select appropriate hash // we have 2 different methods to find a good hash: // - the oldest entry in the cache // - the entry with maximum count - if (cache.size() == 0) return null; + shiftK2W(); + if (wCache.size() == 0) return null; try { - synchronized (cache) { + synchronized (wCache) { String hash = null; int count = hashScore.getMaxScore(); - if ((count > ramCacheReferenceLimit) && + if ((count > wCacheReferenceLimit) && ((hash = (String) hashScore.getMaxObject()) != null) && - (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) > ramCacheMinAge)) { + (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) > wCacheMinAge)) { // flush high-score entries, but not if they are too 'young' return hash; } long oldestTime = longEmit(hashDate.getMinScore()); - if (((System.currentTimeMillis() - oldestTime) > ramCacheMaxAge) && + if (((System.currentTimeMillis() - oldestTime) > wCacheMaxAge) && ((hash = (String) hashDate.getMinObject()) != null)) { // flush out-dated entries return hash; @@ -280,7 +333,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if (Runtime.getRuntime().freeMemory() < 10000000) { // low-memory case hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM) - if (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) < ramCacheMinAge) { + if (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) < wCacheMinAge) { // to young, take it from the oldest entries hash = (String) hashDate.getMinObject(); } @@ -297,25 +350,19 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { } private int intTime(long longTime) { - return (int) ((longTime - startTime) / 1000); + return (int) Math.max(0, ((longTime - startTime) / 1000)); } private long longEmit(int intTime) { return (((long) intTime) * (long) 1000) + startTime; } - /* - private long longTime(int intTime) { - return ((long) intTime) * ((long) 1000) + startTime; - } - */ - public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty) { - return (plasmaWordIndexEntryContainer) cache.get(wordHash); + return (plasmaWordIndexEntryContainer) wCache.get(wordHash); } public long getUpdateTime(String wordHash) { - plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); + plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) wCache.get(wordHash); if (entries == null) return 0; return entries.updated(); /* @@ -327,8 +374,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { public plasmaWordIndexEntryContainer deleteContainer(String wordHash) { // returns the index that had been deleted - synchronized (cache) { - plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) cache.remove(wordHash); + synchronized (wCache) { + plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.remove(wordHash); hashScore.deleteScore(wordHash); hashDate.deleteScore(wordHash); return container; @@ -338,7 +385,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { if (urlHashes.length == 0) return 0; int count = 0; - synchronized (cache) { + synchronized (wCache) { plasmaWordIndexEntryContainer c = (plasmaWordIndexEntryContainer) deleteContainer(wordHash); if (c != null) { count = c.removeEntries(wordHash, urlHashes, deleteComplete); @@ -348,12 +395,13 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { return count; } + /* public int tryRemoveURLs(String urlHash) { // this tries to delete an index from the cache that has this // urlHash assigned. This can only work if the entry is really fresh // Such entries must be searched in the latest entries int delCount = 0; - synchronized (cache) { + synchronized (wCache) { Iterator i = hashDate.scores(false); String wordHash; long t; @@ -362,11 +410,11 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { wordHash = (String) i.next(); // check time t = longEmit(hashDate.getScore(wordHash)); - if (System.currentTimeMillis() - t > ramCacheMinAge) return delCount; + if (System.currentTimeMillis() - t > wCacheMinAge) return delCount; // get container - c = (plasmaWordIndexEntryContainer) cache.get(wordHash); + c = (plasmaWordIndexEntryContainer) wCache.get(wordHash); if (c.remove(urlHash) != null) { - cache.put(wordHash, c); + wCache.put(wordHash, c); hashScore.decScore(wordHash); delCount++; } @@ -374,50 +422,87 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { } return delCount; } + */ - public int addEntries(plasmaWordIndexEntryContainer container, long updateTime, boolean highPriority) { + public int tryRemoveURLs(String urlHash) { + // this tries to delete an index from the cache that has this + // urlHash assigned. This can only work if the entry is really fresh + // Such entries must be searched in the latest entries + int delCount = 0; + synchronized (kCache) { + Iterator i = kCache.entrySet().iterator(); + Map.Entry entry; + Long l; + plasmaWordIndexEntryContainer c; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + l = (Long) entry.getKey(); + + // get container + c = (plasmaWordIndexEntryContainer) entry.getValue(); + if (c.remove(urlHash) != null) { + if (c.size() == 0) { + i.remove(); + } else { + kCache.put(l, c); // superfluous? + } + delCount++; + } + } + } + return delCount; + } + + public int addEntries(plasmaWordIndexEntryContainer container, long updateTime, boolean dhtCase) { // this puts the entries into the cache, not into the assortment directly - int added = 0; - // check cache space - - //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); // put new words into cache - String wordHash = container.wordHash(); - plasmaWordIndexEntryContainer entries = null; - synchronized (cache) { - // put container into cache - entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null + if (dhtCase) synchronized (kCache) { + // put container into kCache + kCache.put(new Long(updateTime + kCacheInc), container); + kCacheInc++; + if (kCacheInc > 10000) kCacheInc = 0; + added = container.size(); + } else synchronized (wCache) { + // put container into wCache + String wordHash = container.wordHash(); + plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) wCache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash); added = entries.add(container); if (added > 0) { - cache.put(wordHash, entries); + wCache.put(wordHash, entries); hashScore.addScore(wordHash, added); hashDate.setScore(wordHash, intTime(updateTime)); } + entries = null; } - entries = null; return added; } - public boolean addEntry(String wordHash, plasmaWordIndexEntry newEntry, long updateTime) { - plasmaWordIndexEntryContainer container = null; - plasmaWordIndexEntry[] entries = null; - synchronized (cache) { - container = (plasmaWordIndexEntryContainer) cache.get(wordHash); + public boolean addEntry(String wordHash, plasmaWordIndexEntry newEntry, long updateTime, boolean dhtCase) { + if (dhtCase) synchronized (kCache) { + // put container into kCache + plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); + container.add(newEntry); + kCache.put(new Long(updateTime + kCacheInc), container); + kCacheInc++; + if (kCacheInc > 10000) kCacheInc = 0; + return true; + } else synchronized (wCache) { + plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.get(wordHash); if (container == null) container = new plasmaWordIndexEntryContainer(wordHash); - entries = new plasmaWordIndexEntry[] { newEntry }; + plasmaWordIndexEntry[] entries = new plasmaWordIndexEntry[] { newEntry }; if (container.add(entries, updateTime) > 0) { - cache.put(wordHash, container); + wCache.put(wordHash, container); hashScore.incScore(wordHash); hashDate.setScore(wordHash, intTime(updateTime)); return true; } + container = null; + entries = null; + return false; } - container = null; - entries = null; - return false; } public void close(int waitingSeconds) { diff --git a/source/de/anomic/plasma/plasmaWordIndexInterface.java b/source/de/anomic/plasma/plasmaWordIndexInterface.java index d0c5878b8..dc47838a3 100644 --- a/source/de/anomic/plasma/plasmaWordIndexInterface.java +++ b/source/de/anomic/plasma/plasmaWordIndexInterface.java @@ -55,7 +55,8 @@ public interface plasmaWordIndexInterface { public plasmaWordIndexEntryContainer deleteContainer(String wordHash); public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete); - public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime, boolean highPriority); + public boolean addEntry(String wordHash, plasmaWordIndexEntry entry, long updateTime, boolean dhtCase); + public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime, boolean dhtCase); public void close(int waitingSeconds); diff --git a/yacy.init b/yacy.init index 9d7f71ecd..096d96500 100644 --- a/yacy.init +++ b/yacy.init @@ -310,18 +310,6 @@ yacyDebugMode=false #staticIP if you have a static IP, you can use this setting staticIP= -# if the process is running behind a NAT or ROUTER, we cannot easily identify -# the public IP of the process. We can ask a public IP responder, but cannot -# rely on it. Therefore, AnomicHTTPProxy includes it's own responder. -# But for the first running peer this is not an option. -# The author uses a DI-604 router, which can be -# asked for the public IP. If you own a DI-604 as well, please set the -# DI604use to true and put in your router password, it will not be used for any -# other purpose of asking for the IP -#DI604use=true -DI604use=false -DI604pw= - # each time the proxy starts up, it can trigger the local browser to show the # status page. This is active by default, to make it easier for first-time # users to understand what this application does. You can disable browser @@ -513,7 +501,6 @@ javastart_Xmx=Xmx64m # -Xms set initial Java heap size javastart_Xms=Xms10m - # performance properties for the word index cache # wordCacheMaxLow/High is the number of word indexes that shall be held in the # ram cache during indexing. When YaCy is shut down, this cache must be @@ -522,9 +509,7 @@ javastart_Xms=Xms10m # remote index transmissions and search requests # maxWaitingWordFlush gives the number of seconds that the shutdown # may last for the word flush -wordCacheMaxLow = 12000 -wordCacheMaxHigh = 16000 -maxWaitingWordFlush = 180 +wordCacheMaxCount = 12000 # Specifies if yacy can be used as transparent http proxy. #
Cache TypeIndexingDHTDescription
Words in RAM Cache:#[wordCacheRAMSize]#Words in RAM cache:#[wordCacheWSize]##[wordCacheKSize]# - This is the current size of the word cache. - The smaller this number, the faster the shut-down procedure will be. - The maximum of this cache can be set below. + This is the current size of the word caches. + The indexing cache speeds up the indexing process, the DHT cache holds indexes temporary for approval. + The maximum of this caches can be set below.
Maximum URLs currently assigned
to one cached word:
#[maxURLinWordCache]##[maxURLinWCache]#not controlled
for DHT cache
This is the maximum size of URLs assigned to a single word cache entry. If this is a big number, it shows that the caching works efficiently.
Maximum Age of Word in cache:#[maxAgeOfWordCache]#Maximum age of a word:#[maxAgeOfWCache]##[maxAgeOfKCache]# - This is the maximum age of a word index that is in the RAM cache in minutes. + This is the maximum age of a word in an index in minutes.
Minimum Age of Word in cache:#[minAgeOfWordCache]#Minimum age of a word:#[minAgeOfWCache]##[minAgeOfKCache]# - This is the minimum age of a word index that is in the RAM cache in minutes. + This is the minimum age of a word in an index in minutes.
Maximum number of Word Caches, low limit:Maximum number of words in cache:cannot be set for DHT This is is the number of word indexes that shall be held in the ram cache during indexing. When YaCy is shut down, this cache must be - flushed to disc; this may last some minutes. The low limit is valid for crawling tasks, the high limit is valid - for search and DHT transmission tasks. + flushed to disc; this may last some minutes.
Maximum number of Word Caches, high limit:
+ Changes take effect immediately