From 474659a71ff1b4a0ec02d3689f07ac6cb3ab6f0e Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 3 Jul 2008 13:08:37 +0000 Subject: [PATCH] - modified and enhanced the crawl balancer: better list export, fixing of damaged crawl queue at start-up, re-sorting at start-up to enhance domain order - added option to set minimum crawl delta for domains in balancer - added default values to crawl deltas in yacy.init - added configuration for these deltas in performance queues - enhanced performance setting computation (more time for indexing queue for a faster flush - remote crawling is now enabled during local crawling if indexer has space and time for more links - added database stub for new distributed file system - refactoring of time computation to get an abstraction level that will be used by a TTL rule in new distributed file system git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4966 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 9 ++ htroot/ConfigBasic.java | 1 - htroot/IndexCreateWWWGlobalQueue_p.java | 7 +- htroot/IndexCreateWWWLocalQueue_p.java | 7 +- htroot/IndexCreateWWWRemoteQueue_p.java | 7 +- htroot/PerformanceQueues_p.html | 94 ++++++++---- htroot/PerformanceQueues_p.java | 14 +- htroot/xml/queues_p.java | 6 +- source/de/anomic/crawler/Balancer.java | 60 +++++--- source/de/anomic/crawler/CrawlQueues.java | 19 ++- source/de/anomic/crawler/IndexingStack.java | 1 + source/de/anomic/crawler/NoticedURL.java | 54 ++++--- source/de/anomic/index/indexRWIRowEntry.java | 15 +- source/de/anomic/index/indexRWIVarEntry.java | 6 +- .../de/anomic/kelondro/kelondroMicroDate.java | 64 ++++++++ .../de/anomic/kelondro/kelondroRelations.java | 142 ++++++++++++++++++ .../anomic/plasma/plasmaRankingCRProcess.java | 9 +- .../de/anomic/plasma/plasmaSwitchboard.java | 10 +- .../de/anomic/plasma/plasmaWebStructure.java | 5 +- source/de/anomic/plasma/plasmaWordIndex.java | 31 ---- source/de/anomic/yacy/yacyClient.java | 2 + 21 files changed, 422 
insertions(+), 141 deletions(-) create mode 100755 source/de/anomic/kelondro/kelondroMicroDate.java create mode 100755 source/de/anomic/kelondro/kelondroRelations.java diff --git a/defaults/yacy.init b/defaults/yacy.init index 3ca445d26..2d6709495 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -206,6 +206,15 @@ proxyCacheSize__pro = 1024 # storage place for new releases releases = DATA/RELEASE +# time limits for the crawler: +# these times (milliseconds) are the shortest times for an access of the crawler to the same domain +# the crawler may read files faster than that, but never from the same domain faster than these time intervals +# a delta of 500 milliseconds means that no more than two files are taken from the same server +# there is a hard-coded limit which prevents that the used time is shorter that these default times +# the time-limits are distinguished for local and global crawls: there is no limit for an intranet-crawl. +minimumLocalDelta = 0 +minimumGlobalDelta = 500 + # the following mime-types are the whitelist for indexing # # parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser diff --git a/htroot/ConfigBasic.java b/htroot/ConfigBasic.java index cfce71d13..165601d37 100644 --- a/htroot/ConfigBasic.java +++ b/htroot/ConfigBasic.java @@ -59,7 +59,6 @@ import de.anomic.server.serverDomains; import de.anomic.server.serverInstantBusyThread; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import de.anomic.server.servletProperties; import de.anomic.yacy.yacyAccessible; import de.anomic.yacy.yacySeed; diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index f40ecdc90..338f41b7d 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -44,6 +44,7 @@ // if the shell's current path is HTROOT import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Date; import 
java.util.Locale; @@ -103,7 +104,7 @@ public class IndexCreateWWWGlobalQueue_p { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - CrawlEntry[] crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit); + ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit); CrawlEntry urle; boolean dark = true; @@ -111,8 +112,8 @@ public class IndexCreateWWWGlobalQueue_p { String profileHandle; CrawlProfile.entry profileEntry; int i, showNum = 0; - for (i = 0; (i < crawlerList.length) && (showNum < showLimit); i++) { - urle = crawlerList[i]; + for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { + urle = crawlerList.get(i); if ((urle != null)&&(urle.url()!=null)) { initiator = sb.webIndex.seedDB.getConnected(urle.initiator()); profileHandle = urle.profileHandle(); diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 2949f6a98..0d580717c 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -44,6 +44,7 @@ // if the shell's current path is HTROOT import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Date; import java.util.Iterator; import java.util.Locale; @@ -171,7 +172,7 @@ public class IndexCreateWWWLocalQueue_p { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - CrawlEntry[] crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); + ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); CrawlEntry urle; boolean dark = true; @@ -179,8 +180,8 @@ public class IndexCreateWWWLocalQueue_p { String profileHandle; CrawlProfile.entry profileEntry; int i; - for (i = 0; (i < crawlerList.length) && (showNum < showLimit); i++) { - urle = crawlerList[i]; + for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { + urle = crawlerList.get(i); 
if ((urle != null)&&(urle.url()!=null)) { initiator = sb.webIndex.seedDB.getConnected(urle.initiator()); profileHandle = urle.profileHandle(); diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java index 15d5e1968..59de4b972 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ b/htroot/IndexCreateWWWRemoteQueue_p.java @@ -44,6 +44,7 @@ // if the shell's current path is HTROOT import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Date; import java.util.Locale; @@ -103,7 +104,7 @@ public class IndexCreateWWWRemoteQueue_p { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - CrawlEntry[] crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit); + ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit); CrawlEntry urle; boolean dark = true; @@ -111,8 +112,8 @@ public class IndexCreateWWWRemoteQueue_p { String profileHandle; CrawlProfile.entry profileEntry; int i, showNum = 0; - for (i = 0; (i < crawlerList.length) && (showNum < showLimit); i++) { - urle = crawlerList[i]; + for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { + urle = crawlerList.get(i); if (urle != null && urle.url() != null) { initiator = sb.webIndex.seedDB.getConnected(urle.initiator()); profileHandle = urle.profileHandle(); diff --git a/htroot/PerformanceQueues_p.html b/htroot/PerformanceQueues_p.html index 3ea43e196..26986f889 100644 --- a/htroot/PerformanceQueues_p.html +++ b/htroot/PerformanceQueues_p.html @@ -153,27 +153,49 @@

- Thread pool settings: + Balancer Settings: +

+

+ This is the minimum time delta between two accesses to the same domain during a crawl. The crawl balancer tries to prevent domains from being + accessed too often, but if the balancer fails (i.e. if there are only links left from the same domain), then these minimum + delta times are enforced. +

+
+ + + + + + + + + + + + + + + + +
Crawler Domain Minimum Access Time Delta
local (intranet) crawls
global (internet) crawls
+ Changes take effect immediately
+
+ +

+ Thread Pool Settings:

- - - - - - #{pool}# - #{/pool}# @@ -184,6 +206,40 @@
Thread Pool maximum Active current ActiveFull Description
#[name]# #[numActive]#
+ +

+ Online Caution Settings:
+ This is the time that the crawler idles when the proxy is accessed, or a local or remote search is done. + The delay is extended by this time each time the proxy is accessed afterwards. + This shall improve performance of the affected process (proxy or search). + (current delta is #[crawlPauseProxyCurrent]#/#[crawlPauseLocalsearchCurrent]#/#[crawlPauseRemotesearchCurrent]# + seconds since last proxy/local-search/remote-search access.) +

+
+ + + + + + + + + + + + + + + + + + + + +
Online Caution Case indexer delay (milliseconds) after case occurrence
Proxy:
Local Search:
Remote Search:
+ Changes take effect immediately
+
+
YaCy Priority Settings
@@ -199,28 +255,6 @@
-
-
Proxy Performance Settings -

- This is the time that the crawler idles when the proxy is accessed, or a local or remote search is done. - The delay is extended by this time each time the proxy is accessed afterwards. - This shall improve performance of the affected process (proxy or search). - (current delta is #[crawlPauseProxyCurrent]#/#[crawlPauseLocalsearchCurrent]#/#[crawlPauseRemotesearchCurrent]# - seconds since last proxy/local-search/remote-search access.) -

-
-
:
-
-
:
-
-
:
-
-
-
Changes take effect immediately
-
-
-
- #%env/templates/footer.template%# diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index 9d76c6068..2a1fbab29 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -69,7 +69,6 @@ public class PerformanceQueues_p { performanceProfiles.put("defaults/performance_dht.profile", "prefer DHT"); } - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch sb) { // return variable that accumulates replacements plasmaSwitchboard switchboard = (plasmaSwitchboard) sb; @@ -253,6 +252,19 @@ public class PerformanceQueues_p { switchboard.setConfig(plasmaSwitchboard.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000))); } + if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) { + long minimumLocalDelta = post.getLong("minimumLocalDelta", switchboard.crawlQueues.noticeURL.getMinimumLocalDelta()); + long minimumGlobalDelta = post.getLong("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta()); + switchboard.setConfig("minimumLocalDelta", minimumLocalDelta); + switchboard.setConfig("minimumGlobalDelta", minimumGlobalDelta); + switchboard.crawlQueues.noticeURL.setMinimumLocalDelta(minimumLocalDelta); + switchboard.crawlQueues.noticeURL.setMinimumGlobalDelta(minimumGlobalDelta); + } + + // delta settings + prop.put("minimumLocalDelta", switchboard.crawlQueues.noticeURL.getMinimumLocalDelta()); + prop.put("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta()); + // table cache settings prop.putNum("urlCacheSize", switchboard.webIndex.getURLwriteCacheSize()); prop.putNum("wordCacheWSize", switchboard.webIndex.dhtOutCacheSize()); diff --git a/htroot/xml/queues_p.java b/htroot/xml/queues_p.java index c689d8a4b..f3c105ba1 100644 --- a/htroot/xml/queues_p.java +++ b/htroot/xml/queues_p.java @@ -181,13 +181,13 @@ public class queues_p { } - public static final void addNTable(plasmaSwitchboard 
sb, serverObjects prop, String tableName, CrawlEntry[] crawlerList) { + public static final void addNTable(plasmaSwitchboard sb, serverObjects prop, String tableName, ArrayList crawlerList) { int showNum = 0; CrawlEntry urle; yacySeed initiator; - for (int i = 0; i < crawlerList.length; i++) { - urle = crawlerList[i]; + for (int i = 0; i < crawlerList.size(); i++) { + urle = crawlerList.get(i); if ((urle != null) && (urle.url() != null)) { initiator = sb.webIndex.seedDB.getConnected(urle.initiator()); prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle()); diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index a9956196e..84b70ee69 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -116,6 +116,21 @@ public class Balancer { // create a stack for newly entered entries if (!(cachePath.exists())) cachePath.mkdir(); // make the path openFileIndex(); + if (urlFileStack.size() != urlFileIndex.size() || (urlFileIndex.size() < 10000 && urlFileIndex.size() > 0)) { + // fix the file stack + serverLog.logInfo("Balancer", "re-creating the " + stackname + " balancer stack, size = " + urlFileIndex.size() + ((urlFileStack.size() == urlFileIndex.size()) ? 
"" : " (the old stack size was wrong)" )); + urlFileStack = kelondroStack.reset(urlFileStack); + try { + Iterator i = urlFileIndex.keys(true, null); + byte[] hash; + while (i.hasNext()) { + hash = i.next(); + pushHash(new String(hash)); + } + } catch (IOException e) { + e.printStackTrace(); + } + } } public synchronized void close() { @@ -134,7 +149,7 @@ public class Balancer { public void finalize() { if (urlFileStack != null) { - serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " + stackname + " closed by finalizer"); + serverLog.logWarning("Balancer", "crawl stack " + stackname + " closed by finalizer"); close(); } } @@ -321,24 +336,29 @@ public class Balancer { return; } + // add to index + urlFileIndex.put(entry.toRow()); + + // add the hash to a queue + pushHash(entry.url().hash()); + } + + private void pushHash(String hash) throws IOException { // extend domain stack - String dom = entry.url().hash().substring(6); + String dom = hash.substring(6); LinkedList domainList = domainStacks.get(dom); if (domainList == null) { // create new list domainList = new LinkedList(); synchronized (domainStacks) { - domainList.add(entry.url().hash()); + domainList.add(hash); domainStacks.put(dom, domainList); } } else { // extend existent domain list - domainList.addLast(entry.url().hash()); + domainList.addLast(hash); } - // add to index - urlFileIndex.put(entry.toRow()); - // check size of domainStacks and flush if ((domainStacks.size() > 100) || (sizeDomainStacks() > 1000)) { flushOnceDomStacks(1, urlRAMStack.size() < 100); // when the ram stack is small, flush it there @@ -507,16 +527,16 @@ public class Balancer { if (lastAccess == null) return Long.MAX_VALUE; // never accessed return System.currentTimeMillis() - lastAccess.time(); } - - public synchronized CrawlEntry top(int dist) throws IOException { + + public synchronized ArrayList top(int count) throws IOException { // if we need to flush anything, then flush the domain stack first, // to avoid that new urls 
get hidden by old entries from the file stack if (urlRAMStack == null) return null; - while ((domainStacksNotEmpty()) && (urlRAMStack.size() <= dist)) { + while ((domainStacksNotEmpty()) && (urlRAMStack.size() <= count)) { // flush only that much as we need to display flushOnceDomStacks(0, true); } - while ((urlFileStack != null) && (urlRAMStack.size() <= dist) && (urlFileStack.size() > 0)) { + while ((urlFileStack != null) && (urlRAMStack.size() <= count) && (urlFileStack.size() > 0)) { // flush some entries from disc to ram stack try { kelondroRow.Entry t = urlFileStack.pop(); @@ -526,16 +546,18 @@ public class Balancer { break; } } - if (dist >= urlRAMStack.size()) return null; - String urlhash = urlRAMStack.get(dist); - kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes()); - if (entry == null) { - if (kelondroAbstractRecords.debugmode) serverLog.logWarning("PLASMA BALANCER", "no entry in index for urlhash " + urlhash); - return null; + + count = Math.min(count, urlRAMStack.size()); + ArrayList list = new ArrayList(); + for (int i = 0; i < count; i++) { + String urlhash = urlRAMStack.get(i); + kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes()); + if (entry == null) break; + list.add(new CrawlEntry(entry)); } - return new CrawlEntry(entry); + return list; } - + public synchronized Iterator iterator() throws IOException { return new EntryIterator(); } diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 135fd4e4a..bdd077bbc 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -261,7 +261,7 @@ public class CrawlQueues { return false; } - if (sb.webIndex.queuePreStack.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) { + if (sb.webIndex.queuePreStack.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30) / 2) { if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, 
dismissed (" + "sbQueueSize=" + sb.webIndex.queuePreStack.size() + ")"); return false; } @@ -276,12 +276,21 @@ public class CrawlQueues { return false; } + if (remoteTriggeredCrawlJobSize() > 0) { + if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: the remote-triggered crawl job queue is filled, omitting processing"); + return false; + } + + /* + if (coreCrawlJobSize() > 0) { + if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: a local crawl is running, omitting processing"); + return false; + } + */ + // check if we have an entry in the provider list, otherwise fill the list yacySeed seed; - if ((remoteCrawlProviderHashes.size() == 0) && - (coreCrawlJobSize() == 0) && - (remoteTriggeredCrawlJobSize() == 0) && - (sb.queueSize() < 10)) { + if (remoteCrawlProviderHashes.size() == 0) { if (sb.webIndex.seedDB != null && sb.webIndex.seedDB.sizeConnected() > 0) { Iterator e = sb.webIndex.peerActions.dhtAction.getProvidesRemoteCrawlURLs(); while (e.hasNext()) { diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index 1508f2a12..bfcd19e1d 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -194,6 +194,7 @@ public class IndexingStack { } public Collection getActiveQueueEntries() { + // todo: check dead entries? 
return this.queueInProcess.values(); } diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 0d2c8c67f..8aebe53b1 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -62,9 +62,9 @@ public class NoticedURL { public static final int STACK_TYPE_MOVIE = 12; // put on movie stack public static final int STACK_TYPE_MUSIC = 13; // put on music stack - private static final long minimumLocalDelta = 0; // the minimum time difference between access of the same local domain - private static final long minimumGlobalDelta = 333; // the minimum time difference between access of the same global domain - private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt + private static final long minimumLocalDeltaInit = 0; // the minimum time difference between access of the same local domain + private static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain + private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt private Balancer coreStack; // links found by crawling to depth-1 private Balancer limitStack; // links found by crawling at target depth @@ -73,14 +73,34 @@ public class NoticedURL { //private kelondroStack imageStack; // links pointing to image resources //private kelondroStack movieStack; // links pointing to movie resources //private kelondroStack musicStack; // links pointing to music resources - + private long minimumLocalDelta; + private long minimumGlobalDelta; + public NoticedURL(File cachePath) { - coreStack = new Balancer(cachePath, "urlNoticeCoreStack", false); - limitStack = new Balancer(cachePath, "urlNoticeLimitStack", false); + this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", false); + this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", false); 
//overhangStack = new plasmaCrawlBalancer(overhangStackFile); - remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", false); + this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", false); + this.minimumLocalDelta = minimumLocalDeltaInit; + this.minimumGlobalDelta = minimumGlobalDeltaInit; } + public long getMinimumLocalDelta() { + return this.minimumLocalDelta; + } + + public long getMinimumGlobalDelta() { + return this.minimumGlobalDelta; + } + + public void setMinimumLocalDelta(long newDelta) { + this.minimumLocalDelta = Math.max(minimumLocalDeltaInit, newDelta); + } + + public void setMinimumGlobalDelta(long newDelta) { + this.minimumGlobalDelta = Math.max(minimumGlobalDeltaInit, newDelta); + } + public void clear() { coreStack.clear(); limitStack.clear(); @@ -185,7 +205,7 @@ public class NoticedURL { return removed; } - public CrawlEntry[] top(int stackType, int count) { + public ArrayList top(int stackType, int count) { switch (stackType) { case STACK_TYPE_CORE: return top(coreStack, count); case STACK_TYPE_LIMIT: return top(limitStack, count); @@ -240,20 +260,16 @@ public class NoticedURL { throw new IOException("balancer stack is empty"); } - private CrawlEntry[] top(Balancer balancer, int count) { + private ArrayList top(Balancer balancer, int count) { // this is a filo - top if (count > balancer.size()) count = balancer.size(); - ArrayList list = new ArrayList(count); - for (int i = 0; i < count; i++) { - try { - CrawlEntry entry = balancer.top(i); - if (entry == null) break; - list.add(entry); - } catch (IOException e) { - break; - } + ArrayList list; + try { + list = balancer.top(count); + } catch (IOException e) { + list = new ArrayList(0); } - return list.toArray(new CrawlEntry[list.size()]); + return list; } public Iterator iterator(int stackType) { diff --git a/source/de/anomic/index/indexRWIRowEntry.java b/source/de/anomic/index/indexRWIRowEntry.java index a66eb180f..83afefca3 100644 --- 
a/source/de/anomic/index/indexRWIRowEntry.java +++ b/source/de/anomic/index/indexRWIRowEntry.java @@ -29,9 +29,9 @@ package de.anomic.index; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroColumn; +import de.anomic.kelondro.kelondroMicroDate; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow.Entry; -import de.anomic.plasma.plasmaWordIndex; import de.anomic.yacy.yacySeedDB; public final class indexRWIRowEntry implements indexRWIEntry { @@ -112,8 +112,8 @@ public final class indexRWIRowEntry implements indexRWIEntry { assert (urlHash.length() == 12) : "urlhash = " + urlHash; if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk"; this.entry = urlEntryRow.newEntry(); - int mddlm = plasmaWordIndex.microDateDays(lastmodified); - int mddct = plasmaWordIndex.microDateDays(updatetime); + int mddlm = kelondroMicroDate.microDateDays(lastmodified); + int mddct = kelondroMicroDate.microDateDays(updatetime); this.entry.setCol(col_urlhash, urlHash, null); this.entry.setCol(col_lastModified, mddlm); this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation @@ -158,11 +158,6 @@ public final class indexRWIRowEntry implements indexRWIEntry { this.entry = rentry; } - public static int days(long time) { - // calculates the number of days since 1.1.1970 and returns this as 4-byte array - return (int) (time / 86400000); - } - public indexRWIRowEntry clone() { byte[] b = new byte[urlEntryRow.objectsize]; System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize); @@ -186,11 +181,11 @@ public final class indexRWIRowEntry implements indexRWIEntry { } public long lastModified() { - return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified)); + return kelondroMicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified)); } public long freshUntil() { - return 
plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil)); + return kelondroMicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil)); } public int hitcount() { diff --git a/source/de/anomic/index/indexRWIVarEntry.java b/source/de/anomic/index/indexRWIVarEntry.java index fd55e5a55..bfd2eaf2a 100644 --- a/source/de/anomic/index/indexRWIVarEntry.java +++ b/source/de/anomic/index/indexRWIVarEntry.java @@ -27,7 +27,7 @@ package de.anomic.index; import de.anomic.kelondro.kelondroBitfield; -import de.anomic.plasma.plasmaWordIndex; +import de.anomic.kelondro.kelondroMicroDate; public class indexRWIVarEntry implements indexRWIEntry { @@ -62,8 +62,8 @@ public class indexRWIVarEntry implements indexRWIEntry { double termfrequency ) { if ((language == null) || (language.length() != 2)) language = "uk"; - int mddlm = plasmaWordIndex.microDateDays(lastmodified); - int mddct = plasmaWordIndex.microDateDays(updatetime); + int mddlm = kelondroMicroDate.microDateDays(lastmodified); + int mddct = kelondroMicroDate.microDateDays(updatetime); this.flags = flags; this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2); this.lastModified = lastmodified; diff --git a/source/de/anomic/kelondro/kelondroMicroDate.java b/source/de/anomic/kelondro/kelondroMicroDate.java new file mode 100755 index 000000000..a1fef46d9 --- /dev/null +++ b/source/de/anomic/kelondro/kelondroMicroDate.java @@ -0,0 +1,64 @@ +// microDate.java +// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany +// first published 3.7.2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro; + +import java.util.Date; + +public class kelondroMicroDate { + + private static final long hour = 3600000L; // milliseconds of a hour + private static final long day = 86400000L; // milliseconds of a day + + public static int microDateDays(Date modified) { + return microDateDays(modified.getTime()); + } + + public static int microDateDays(long modified) { + // this calculates a virtual age from a given date + // the purpose is to have an age in days of a given modified date + // from a fixed standpoint in the past + // one day has 60*60*24 seconds = 86400 seconds + // we take mod 64**3 = 262144, this is the mask of the storage + return (int) ((modified / day) % 262144L); + } + + public static String microDateHoursStr(long time) { + return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3); + } + + public static int microDateHoursInt(long time) { + return (int) ((time / hour) % 262144L); + } + + public static int 
microDateHoursAge(String mdhs) { + return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs); + } + + public static long reverseMicroDateDays(long microDateDays) { + return Math.min(System.currentTimeMillis(), microDateDays * day); + } +} diff --git a/source/de/anomic/kelondro/kelondroRelations.java b/source/de/anomic/kelondro/kelondroRelations.java new file mode 100755 index 000000000..4ecd222ce --- /dev/null +++ b/source/de/anomic/kelondro/kelondroRelations.java @@ -0,0 +1,142 @@ +package de.anomic.kelondro; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; + +public class kelondroRelations { + + private File baseDir; + private HashMap relations; + + public kelondroRelations(File location) { + this.baseDir = location; + } + + private static kelondroRow rowdef(String filename) { + int p = filename.lastIndexOf('.'); + if (p >= 0) filename = filename.substring(0, p); + p = filename.lastIndexOf('-'); + assert p >= 0; + int payloadsize = Integer.parseInt(filename.substring(p + 1)); + filename = filename.substring(0, p); + p = filename.lastIndexOf('-'); + assert p >= 0; + int keysize = Integer.parseInt(filename.substring(p + 1)); + return rowdef(keysize, payloadsize); + } + + private static kelondroRow rowdef(int keysize, int payloadsize) { + return new kelondroRow( + "byte[] key-" + keysize + ", " + + "long time-8" + keysize + ", " + + "int ttl-4" + keysize + ", " + + "byte[] node-" + payloadsize, + kelondroNaturalOrder.naturalOrder, 0); + } + + private static String filename(String tablename, int keysize, int payloadsize) { + return tablename + "-" + keysize + "-" + payloadsize + ".eco"; + } + + public void declareRelation(String name, int keysize, int payloadsize) { + // try to get the relation from the relation-cache + kelondroIndex relation = relations.get(name); + if (relation != null) return; + // try to find the relation as stored on file + String[] list = baseDir.list(); + 
String targetfilename = filename(name, keysize, payloadsize); + for (int i = 0; i < list.length; i++) { + if (list[i].startsWith(name)) { + if (!list[i].equals(targetfilename)) continue; + kelondroRow row = rowdef(list[i]); + if (row.primaryKeyLength != keysize || row.column(1).cellwidth != payloadsize) continue; // a wrong table + kelondroIndex table = new kelondroEcoTable(new File(baseDir, list[i]), row, kelondroEcoTable.tailCacheUsageAuto, 1024*1024, 0); + relations.put(name, table); + return; + } + } + // the relation does not exist, create it + kelondroRow row = rowdef(keysize, payloadsize); + kelondroIndex table = new kelondroEcoTable(new File(baseDir, targetfilename), row, kelondroEcoTable.tailCacheUsageAuto, 1024*1024, 0); + relations.put(name, table); + } + + public kelondroIndex getRelation(String name) { + // try to get the relation from the relation-cache + kelondroIndex relation = relations.get(name); + if (relation != null) return relation; + // try to find the relation as stored on file + String[] list = baseDir.list(); + for (int i = 0; i < list.length; i++) { + if (list[i].startsWith(name)) { + kelondroRow row = rowdef(list[i]); + kelondroIndex table = new kelondroEcoTable(new File(baseDir, list[i]), row, kelondroEcoTable.tailCacheUsageAuto, 1024*1024, 0); + relations.put(name, table); + return table; + } + } + // the relation does not exist + return null; + } + + public String putRelation(String name, String key, String value) throws IOException { + byte[] r = putRelation(name, key.getBytes(), value.getBytes()); + if (r == null) return null; + return new String(r); + } + + public byte[] putRelation(String name, byte[] key, byte[] value) throws IOException { + kelondroIndex table = getRelation(name); + if (table == null) return null; + kelondroRow.Entry entry = table.row().newEntry(); + entry.setCol(0, key); + entry.setCol(1, System.currentTimeMillis()); + entry.setCol(2, 1000000); + entry.setCol(3, value); + kelondroRow.Entry oldentry = 
table.put(entry); + if (oldentry == null) return null; + return oldentry.getColBytes(3); + } + + public String getRelation(String name, String key) throws IOException { + byte[] r = getRelation(name, key.getBytes()); + if (r == null) return null; + return new String(r); + } + + public byte[] getRelation(String name, byte[] key) throws IOException { + kelondroIndex table = getRelation(name); + if (table == null) return null; + kelondroRow.Entry entry = table.get(key); + if (entry == null) return null; + return entry.getColBytes(3); + } + + public boolean hasRelation(String name, byte[] key) throws IOException { + kelondroIndex table = getRelation(name); + if (table == null) return false; + return table.has(key); + } + + public byte[] removeRelation(String name, byte[] key) throws IOException { + kelondroIndex table = getRelation(name); + if (table == null) return null; + kelondroRow.Entry entry = table.remove(key, false); + if (entry == null) return null; + return entry.getColBytes(3); + } + + public static void main(String args[]) { + kelondroRelations r = new kelondroRelations(new File("/Users/admin/")); + try { + String table1 = "test1"; + r.declareRelation(table1, 12, 30); + r.putRelation(table1, "abcdefg", "eineintrag"); + r.putRelation(table1, "abcdefg", "eineintrag"); + } catch (IOException e) { + e.printStackTrace(); + } + } + +} diff --git a/source/de/anomic/plasma/plasmaRankingCRProcess.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java index 714086017..301932fe1 100644 --- a/source/de/anomic/plasma/plasmaRankingCRProcess.java +++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java @@ -55,6 +55,7 @@ import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroCollectionIndex; import de.anomic.kelondro.kelondroFlexTable; import de.anomic.kelondro.kelondroIndex; +import de.anomic.kelondro.kelondroMicroDate; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowSet; import de.anomic.server.serverDate; @@ -131,8 
+132,8 @@ public class plasmaRankingCRProcess { } else { // initialize counters and dates acc_entry = acc.newEntry(key, new_entry.getAttrs(), new_entry.getSeqSet()); - FUDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // first update date - FDDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack + FUDate = kelondroMicroDate.microDateHoursInt(System.currentTimeMillis()); // first update date + FDDate = kelondroMicroDate.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack LUDate = (int) new_entry.getAttr("VDate", 0); UCount = 0; PCount = (new_flags.get(1)) ? 1 : 0; @@ -210,8 +211,8 @@ public class plasmaRankingCRProcess { acc_entry.setCol(i, new_entry.getAttr(acc.row().column(i).nickname, 0)); } seq.put(key.getBytes(), new_entry.getSeqCollection()); - FUDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // first update date - FDDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack + FUDate = kelondroMicroDate.microDateHoursInt(System.currentTimeMillis()); // first update date + FDDate = kelondroMicroDate.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack LUDate = (int) new_entry.getAttr("VDate", 0); UCount = 0; PCount = (new_flags.get(1)) ? 1 : 0; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 0df164baa..989ff674d 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1019,6 +1019,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch