diff --git a/build.properties b/build.properties index 003816b2d..d5bdf3820 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.432 +releaseVersion=0.433 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/htroot/PerformanceQueues_p.html b/htroot/PerformanceQueues_p.html index bffa4268d..51df4bc59 100644 --- a/htroot/PerformanceQueues_p.html +++ b/htroot/PerformanceQueues_p.html @@ -86,6 +86,13 @@ Changes take effect immediately If this is a big number, it shows that the caching works efficiently. + + Maximum Age of Word in cache: + #[maxAgeOfWordCache]# + + This is the maximum age of a word index that is in the RAM cache in minutes. + + Maximum number of Word Caches, low limit: diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index 61bbff583..dafe81f37 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -253,6 +253,7 @@ public class PerformanceQueues_p { // table cache settings prop.put("wordCacheRAMSize", switchboard.wordIndex.wordCacheRAMSize()); prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache()); + prop.put("maxAgeOfWordCache", "" + (switchboard.wordIndex.maxAgeOfWordCache() / 1000 / 60)); // minutes prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180")); prop.put("wordCacheMaxLow", switchboard.getConfig("wordCacheMaxLow", "10000")); prop.put("wordCacheMaxHigh", switchboard.getConfig("wordCacheMaxHigh", "10000")); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 681031682..c9ec88d97 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -94,6 +94,10 @@ public final class 
plasmaWordIndex { return ramCache.maxURLinWordCache(); } + public long maxAgeOfWordCache() { + return ramCache.maxAgeOfWordCache(); + } + public int wordCacheRAMSize() { return ramCache.wordCacheRAMSize(); } @@ -123,7 +127,7 @@ public final class plasmaWordIndex { } } } else { - while (ramCache.maxURLinWordCache() > plasmaWordIndexCache.ramCacheLimit) { + while (ramCache.maxURLinWordCache() > plasmaWordIndexCache.ramCacheReferenceLimit) { flushCache(1); } if (ramCache.size() > ramCache.getMaxWordsLow()) { diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 4adb29094..fc225a76e 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -60,7 +60,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { // environment constants private static final String indexArrayFileName = "indexDump1.array"; - public static final int ramCacheLimit = 50; + public static final int ramCacheReferenceLimit = 50; + public static final long ramCacheAgeLimit = 60 * 60 * 2 * 1000; // milliseconds; 2 Hours // class variables private final File databaseRoot; @@ -205,7 +206,11 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { // cache settings public int maxURLinWordCache() { - return hashScore.getScore(hashScore.getMaxObject()); + return hashScore.getMaxScore(); + } + + public long maxAgeOfWordCache() { + return System.currentTimeMillis() - longEmit(hashDate.getMinScore()); } public int wordCacheRAMSize() { @@ -249,18 +254,28 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if (cache.size() == 0) return null; try { synchronized (cache) { - String hash = (String) hashScore.getMaxObject(); - if (hash == null) return null; + String hash = null; int count = hashScore.getMaxScore(); - //long time = longTime(hashDate.getScore(hash)); - if (count > ramCacheLimit) { + if ((count > 
ramCacheReferenceLimit) && + ((hash = (String) hashScore.getMaxObject()) != null)) { // flush high-score entries return hash; - } else { - // flush oldest entries - hash = (String) hashDate.getMinObject(); + } + long oldestTime = longEmit(hashDate.getMinScore()); + if (((System.currentTimeMillis() - oldestTime) > ramCacheAgeLimit) && + ((hash = (String) hashDate.getMinObject()) != null)) { + // flush out-dated entries return hash; } + // not an urgent case + if (Runtime.getRuntime().freeMemory() < 10000000) { + // low-memory case + hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM) + } else { + // not-efficient-so-far case + hash = (String) hashDate.getMinObject(); // flush oldest entries (makes indexing faster) + } + return hash; } } catch (Exception e) { log.logSevere("flushFromMem: " + e.getMessage(), e); @@ -272,6 +287,10 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { return (int) ((longTime - startTime) / 1000); } + private long longEmit(int intTime) { + return (((long) intTime) * (long) 1000) + startTime; + } + /* private long longTime(int intTime) { return ((long) intTime) * ((long) 1000) + startTime; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 5f5e127c2..cbad0d318 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -108,13 +108,18 @@ public final class plasmaWordIndexEntry implements Cloneable { public static final int AP_H5 = 5; // h5-tag public static final int AP_H6 = 6; // h6-tag public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam) - public static final int AP_URL = 8; // word inside an url - public static final int AP_IMG = 9; // tag inside image references - public static final int AP_TAG = 10; // for tagged indexeing (i.e. 
using mp3 tags) + public static final int AP_DOM = 8; // word inside an url: in Domain + public static final int AP_PATH = 9; // word inside an url: in path + public static final int AP_IMG = 10; // tag inside image references public static final int AP_ANCHOR = 11; // anchor description - public static final int AP_BOLD = 12; - public static final int AP_ITALICS = 13; - public static final int AP_INVISIBLE = 14; // good for spam detection + public static final int AP_BOLD = 12; // may be interpreted as emphasized + public static final int AP_ITALICS = 13; // may be interpreted as emphasized + public static final int AP_WEAK = 14; // for Text that is small or barely visible + public static final int AP_INVISIBLE = 15; // good for spam detection + public static final int AP_TAG = 16; // for tagged indexing (i.e. using mp3 tags) + public static final int AP_AUTHOR = 17; // word appears in author name + public static final int AP_OPUS = 18; // word appears in name of opus, which may be an album name (in mp3 tags) + public static final int AP_TRACK = 19; // word appears in track name (i.e.
in mp3 tags) // URL attributes public static final int UA_LOCAL = 0; // URL was crawled locally diff --git a/yacy.init b/yacy.init index 56136aad8..28bc2e054 100644 --- a/yacy.init +++ b/yacy.init @@ -417,8 +417,8 @@ xpstopw=true # the prereq-value is a memory pre-requisite: that much bytes must # be available/free in the heap; othervise the loop is not executed # and another idlesleep is performed -20_dhtdistribution_idlesleep=20000 -20_dhtdistribution_busysleep=5000 +20_dhtdistribution_idlesleep=50000 +20_dhtdistribution_busysleep=2000 20_dhtdistribution_memprereq=8388608 20_dhtdistribution_threads=1 30_peerping_idlesleep=120000 @@ -428,7 +428,7 @@ xpstopw=true 40_peerseedcycle_busysleep=1200000 40_peerseedcycle_memprereq=4194304 50_localcrawl_idlesleep=10000 -50_localcrawl_busysleep=200 +50_localcrawl_busysleep=100 50_localcrawl_memprereq=1048576 50_localcrawl_isPaused=false 61_globalcrawltrigger_idlesleep=10000 @@ -442,8 +442,8 @@ xpstopw=true 70_cachemanager_idlesleep=5000 70_cachemanager_busysleep=0 70_cachemanager_memprereq=1048576 -80_indexing_idlesleep=5000 -80_indexing_busysleep=300 +80_indexing_idlesleep=2000 +80_indexing_busysleep=100 80_indexing_memprereq=2097152 82_crawlstack_idlesleep=5000 82_crawlstack_busysleep=0