From 7f67238f8bbf25cf8935248bcddf192989ae66ca Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 13 Mar 2009 14:56:25 +0000 Subject: [PATCH] refactoring of plasmaWordIndex: fewer methods in the class, separated the index into CachedIndexCollection git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5710 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCleaner_p.java | 2 +- htroot/IndexControlRWIs_p.java | 14 +- htroot/IndexControlURLs_p.java | 2 +- htroot/IndexImport_p.java | 2 +- htroot/IndexShare_p.java | 4 +- htroot/PerformanceGraph.java | 2 +- htroot/PerformanceQueues_p.java | 14 +- htroot/api/queues_p.java | 2 +- htroot/api/status_p.java | 4 +- htroot/api/timeline.java | 2 +- htroot/yacy/query.java | 4 +- htroot/yacy/search.java | 2 +- htroot/yacy/transferRWI.java | 8 +- htroot/yacysearch.java | 2 +- source/de/anomic/crawler/Balancer.java | 6 +- source/de/anomic/crawler/IndexingStack.java | 29 +- source/de/anomic/kelondro/table/Stack.java | 19 +- .../kelondro/text/CachedIndexCollection.java | 449 ++++++++++++++++++ .../kelondro/text/ReferenceContainer.java | 5 +- source/de/anomic/plasma/plasmaDbImporter.java | 22 +- .../de/anomic/plasma/plasmaSearchEvent.java | 6 +- .../plasma/plasmaSearchRankingProcess.java | 2 +- .../de/anomic/plasma/plasmaSnippetCache.java | 4 +- .../de/anomic/plasma/plasmaSwitchboard.java | 16 +- source/de/anomic/plasma/plasmaWordIndex.java | 415 +--------------- source/de/anomic/yacy/yacyClient.java | 5 +- source/yacy.java | 4 +- 27 files changed, 551 insertions(+), 495 deletions(-) create mode 100644 source/de/anomic/kelondro/text/CachedIndexCollection.java diff --git a/htroot/IndexCleaner_p.java b/htroot/IndexCleaner_p.java index 9f4156a79..7fadfc7b0 100755 --- a/htroot/IndexCleaner_p.java +++ b/htroot/IndexCleaner_p.java @@ -94,7 +94,7 @@ public class IndexCleaner_p { prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + ""); prop.put("rwidb_threadToString", indexCleanerThread.toString()); prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart); - prop.putNum("rwidb_RWIcountnow", sb.webIndex.size()); + prop.putNum("rwidb_RWIcountnow", sb.webIndex.index().size()); prop.put("rwidb_wordHashNow", indexCleanerThread.wordHashNow); prop.put("rwidb_lastWordHash", indexCleanerThread.lastWordHash); prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter); diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index fc39a9050..09065e811 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -124,7 +124,7 @@ public class IndexControlRWIs_p { if (delurl || delurlref) { // generate an urlx array ReferenceContainer index = null; - index = sb.webIndex.getReferences(keyhash, null); + index = sb.webIndex.index().getReferences(keyhash, null); final Iterator en = index.entries(); int i = 0; urlx = new String[index.size()]; @@ -141,7 +141,7 @@ public class IndexControlRWIs_p { sb.urlRemove(urlx[i]); } } - sb.webIndex.deleteAllReferences(keyhash); + sb.webIndex.index().deleteAllReferences(keyhash); post.remove("keyhashdeleteall"); post.put("urllist", "generated"); } @@ -158,7 +158,7 @@ public class IndexControlRWIs_p { } final Set urlHashes = new HashSet(); for (int i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]); - sb.webIndex.removeReferences(keyhash, urlHashes); + sb.webIndex.index().removeReferences(keyhash, urlHashes); // this shall lead to a presentation of the list; so handle that the remaining program // thinks that it was called for a list presentation
post.remove("keyhashdelete"); @@ -200,7 +200,7 @@ public class IndexControlRWIs_p { // prepare index ReferenceContainer index; final long starttime = System.currentTimeMillis(); - index = sb.webIndex.getReferences(keyhash, null); + index = sb.webIndex.index().getReferences(keyhash, null); // built urlCache final Iterator urlIter = index.entries(); final HashMap knownURLs = new HashMap(); @@ -237,7 +237,7 @@ public class IndexControlRWIs_p { // generate list if (post.containsKey("keyhashsimilar")) { - final Iterator containerIt = sb.webIndex.indexContainerSet(keyhash, false, true, 256).iterator(); + final Iterator containerIt = sb.webIndex.index().indexContainerSet(keyhash, false, true, 256).iterator(); ReferenceContainer container; int i = 0; int rows = 0, cols = 0; @@ -315,7 +315,7 @@ public class IndexControlRWIs_p { } catch (final IOException e) { } } - sb.webIndex.removeReferences(keyhash, urlHashes); + sb.webIndex.index().removeReferences(keyhash, urlHashes); } if (prop.getInt("searchresult", 0) == 3) plasmaSearchAPI.listHosts(prop, keyhash, sb); @@ -323,7 +323,7 @@ public class IndexControlRWIs_p { // insert constants - prop.putNum("wcount", sb.webIndex.size()); + prop.putNum("wcount", sb.webIndex.index().size()); // return rewrite properties return prop; } diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 3db3200fc..083b1f74d 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -182,7 +182,7 @@ public class IndexControlURLs_p { // generate list if (post.containsKey("urlhashsimilar")) { try { - final Iterator entryIt = new RotateIterator(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.size()); + final Iterator entryIt = new RotateIterator(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); MetadataRowContainer entry; int i = 0; diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 18d23a809..c6421d385 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -106,7 +106,7 @@ public final class IndexImport_p { } } - prop.putNum("wcount", switchboard.webIndex.size()); + prop.putNum("wcount", switchboard.webIndex.index().size()); prop.putNum("ucount", switchboard.webIndex.metadata().size()); /* diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java index 7cb854c75..eede204b5 100644 --- a/htroot/IndexShare_p.java +++ b/htroot/IndexShare_p.java @@ -55,7 +55,7 @@ public class IndexShare_p { prop.put("wordfreq", switchboard.getConfigLong("defaultWordReceiveFrequency",10)); prop.put("dtable", ""); prop.put("rtable", ""); - prop.putNum("wcount", switchboard.webIndex.size()); + prop.putNum("wcount", switchboard.webIndex.index().size()); prop.putNum("ucount", switchboard.webIndex.metadata().size()); return prop; // be save } @@ -68,7 +68,7 @@ public class IndexShare_p { } // insert constants - prop.putNum("wcount", switchboard.webIndex.size()); + prop.putNum("wcount", switchboard.webIndex.index().size()); prop.putNum("ucount", switchboard.webIndex.metadata().size()); // return rewrite properties diff --git a/htroot/PerformanceGraph.java b/htroot/PerformanceGraph.java index 0655fcfbf..cee7d2b7b 100644 --- a/htroot/PerformanceGraph.java +++ b/htroot/PerformanceGraph.java @@ -41,7 +41,7 @@ public class PerformanceGraph { final int width = post.getInt("width", 660); final int height = post.getInt("height", 240); - return plasmaProfiling.performanceGraph(width, height, sb.webIndex.metadata().size() + " URLS / " + sb.webIndex.collectionsSize() + " WORDS IN COLLECTIONS / " + sb.webIndex.cacheSize() + " WORDS IN CACHE"); + return plasmaProfiling.performanceGraph(width, height, sb.webIndex.metadata().size() + " URLS / " + sb.webIndex.index().collectionsSize() + " WORDS IN COLLECTIONS / " + sb.webIndex.index().cacheSize() + " WORDS IN CACHE"); } } \ No newline at end of file diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index 5167cdb4e..6897c3b90 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -199,7 +199,7 @@ public class PerformanceQueues_p { // disallow setting of memprereq for indexer to prevent db from throwing OOMs prop.put("table_" + c + "_disabled", /*(threadName.endsWith("_indexing")) ? 1 :*/ "0"); prop.put("table_" + c + "_recommendation", threadName.endsWith("_indexing") ? "1" : "0"); - prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.webIndex.minMem() / 1024) : 0); + prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? 
(switchboard.webIndex.index().minMem() / 1024) : 0); c++; } prop.put("table", c); @@ -229,7 +229,7 @@ public class PerformanceQueues_p { if ((post != null) && (post.containsKey("cacheSizeSubmit"))) { final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000); switchboard.setConfig(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount)); - switchboard.webIndex.setMaxWordCount(wordCacheMaxCount); + switchboard.webIndex.index().setMaxWordCount(wordCacheMaxCount); final int wordCacheInitCount = post.getInt(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, 30000); switchboard.setConfig(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, Integer.toString(wordCacheInitCount)); @@ -288,11 +288,11 @@ public class PerformanceQueues_p { // table cache settings prop.putNum("urlCacheSize", switchboard.webIndex.metadata().writeCacheSize()); - prop.putNum("wordCacheSize", switchboard.webIndex.indexCacheSize()); - prop.putNum("wordCacheSizeKBytes", switchboard.webIndex.indexCacheSizeBytes()/1024); - prop.putNum("maxURLinCache", switchboard.webIndex.maxURLinCache()); - prop.putNum("maxAgeOfCache", switchboard.webIndex.maxAgeOfCache() / 1000 / 60); // minutes - prop.putNum("minAgeOfCache", switchboard.webIndex.minAgeOfCache() / 1000 / 60); // minutes + prop.putNum("wordCacheSize", switchboard.webIndex.index().indexCacheSize()); + prop.putNum("wordCacheSizeKBytes", switchboard.webIndex.index().indexCacheSizeBytes()/1024); + prop.putNum("maxURLinCache", switchboard.webIndex.index().maxURLinCache()); + prop.putNum("maxAgeOfCache", switchboard.webIndex.index().maxAgeOfCache() / 1000 / 60); // minutes + prop.putNum("minAgeOfCache", switchboard.webIndex.index().minAgeOfCache() / 1000 / 60); // minutes prop.putNum("maxWaitingWordFlush", switchboard.getConfigLong("maxWaitingWordFlush", 180)); prop.put("wordCacheMaxCount", switchboard.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000)); prop.put("wordCacheInitCount", switchboard.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, 30000)); diff --git a/htroot/api/queues_p.java b/htroot/api/queues_p.java index 226df3cc5..573b25833 100755 --- a/htroot/api/queues_p.java +++ b/htroot/api/queues_p.java @@ -42,7 +42,7 @@ public class queues_p { prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.webIndex.queuePreStack.getActiveQueueSize()); prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30)); prop.putNum("urlpublictextSize", sb.webIndex.metadata().size()); - prop.putNum("rwipublictextSize", sb.webIndex.size()); + prop.putNum("rwipublictextSize", sb.webIndex.index().size()); if ((sb.webIndex.queuePreStack.size() == 0) && (sb.webIndex.queuePreStack.getActiveQueueSize() == 0)) { prop.put("list", "0"); //is empty } else { diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index 47efe28c4..437fd9188 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -21,11 +21,11 @@ public class status_p { prop.setLocalized(false); prop.put("rejected", "0"); sb.updateMySeed(); - final int cacheSize = sb.webIndex.indexCacheSize(); + final int cacheSize = sb.webIndex.index().indexCacheSize(); final long cacheMaxSize = sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 10000); prop.putNum("ppm", sb.currentPPM()); prop.putNum("qpm", sb.webIndex.peers().mySeed().getQPM()); - prop.putNum("wordCacheSize", sb.webIndex.indexCacheSize()); + prop.putNum("wordCacheSize", 
sb.webIndex.index().indexCacheSize()); prop.putNum("wordCacheSize", cacheSize); prop.putNum("wordCacheMaxSize", cacheMaxSize); prop.put("wordCacheCount", cacheSize); diff --git a/htroot/api/timeline.java b/htroot/api/timeline.java index 6dc39bd05..f73b499c5 100644 --- a/htroot/api/timeline.java +++ b/htroot/api/timeline.java @@ -78,7 +78,7 @@ public final class timeline { yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links"); // get the index container with the result vector - HashMap[] localSearchContainerMaps = sb.webIndex.localSearchContainers(query[0], query[1], null); + HashMap[] localSearchContainerMaps = sb.webIndex.index().localSearchContainers(query[0], query[1], null); final ReferenceContainer index = ReferenceContainer.joinExcludeContainers( localSearchContainerMaps[0].values(), diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java index e37874eed..d34566b8b 100644 --- a/htroot/yacy/query.java +++ b/htroot/yacy/query.java @@ -82,13 +82,13 @@ public final class query { if (obj.equals("rwiurlcount")) { // the total number of different urls in the rwi is returned // shall contain a word hash, the number of assigned lurls to this hash is returned - prop.put("response", sb.webIndex.getReferences(env, null).size()); + prop.put("response", sb.webIndex.index().getReferences(env, null).size()); return prop; } if (obj.equals("rwicount")) { // return the total number of available word indexes - prop.put("response", sb.webIndex.size()); + prop.put("response", sb.webIndex.index().size()); return prop; } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 5102112f3..4853ff31f 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -185,7 +185,7 @@ public final class search { yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); final long timer = System.currentTimeMillis(); - final Map[] containers = sb.webIndex.localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2Set(urls)); + final Map[] containers = sb.webIndex.index().localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2Set(urls)); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.COLLECTION, containers[0].size(), System.currentTimeMillis() - timer), false); if (containers != null) { diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index bad7b6bc0..e21792644 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -100,9 +100,9 @@ public final class transferRWI { sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted."); result = "not_granted"; pause = 0; - } else if (sb.webIndex.indexCacheSize() > cachelimit) { + } else if (sb.webIndex.index().indexCacheSize() > cachelimit) { // we are too busy to receive indexes - sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.webIndex.indexCacheSize() + ")."); + sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". 
We are too busy (buffersize=" + sb.webIndex.index().indexCacheSize() + ")."); granted = false; // don't accept more words if there are too many words to flush result = "busy"; pause = 60000; @@ -157,7 +157,7 @@ public final class transferRWI { } // learn entry - sb.webIndex.addEntry(wordHash, iEntry, System.currentTimeMillis()); + sb.webIndex.index().addEntry(wordHash, iEntry, System.currentTimeMillis()); serverCore.checkInterruption(); // check if we need to ask for the corresponding URL @@ -193,7 +193,7 @@ public final class transferRWI { } result = "ok"; - pause = (int) (sb.webIndex.indexCacheSize() * 20000 / sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time + pause = (int) (sb.webIndex.index().indexCacheSize() * 20000 / sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time } prop.put("unknownURL", unknownURLs.toString()); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index c43c19e64..412cf178b 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -315,7 +315,7 @@ public class yacysearch { // delete the index entry locally final String delHash = post.get("deleteref", ""); // urlhash - sb.webIndex.removeWordReferences(query[0], delHash); + sb.webIndex.index().removeWordReferences(query[0], delHash); // make new news message with negative voting final HashMap map = new HashMap(); diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 2503d947b..7224f9ff1 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -81,7 +81,7 @@ public class Balancer { if (urlFileStack.size() != urlFileIndex.size() || (urlFileIndex.size() < 10000 && urlFileIndex.size() > 0)) { // fix the file stack Log.logInfo("Balancer", "re-creating the " + stackname + " balancer stack, size = " + urlFileIndex.size() + ((urlFileStack.size() == urlFileIndex.size()) ? "" : " (the old stack size was wrong)" )); - urlFileStack = Stack.reset(urlFileStack); + urlFileStack.clear(); try { final Iterator i = urlFileIndex.keys(true, null); byte[] hash; @@ -130,7 +130,7 @@ public class Balancer { } public synchronized void clear() { - urlFileStack = Stack.reset(urlFileStack); + urlFileStack.clear(); domainStacks.clear(); urlRAMStack.clear(); resetFileIndex(); @@ -544,7 +544,7 @@ public class Balancer { if (nextentry == null) { // emergency case: this means that something with the stack organization is wrong // the file appears to be broken. We kill the file. - Stack.reset(urlFileStack); + urlFileStack.clear(); Log.logSevere("BALANCER", "get() failed to fetch entry from file stack. 
reset stack file."); } else { final String nexthash = new String(nextentry.getColBytes(0)); diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index 5203db1c0..e0bebd842 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -51,19 +51,16 @@ import de.anomic.yacy.yacyURL; public class IndexingStack { - Stack sbQueueStack; - CrawlProfile profiles; - plasmaWordIndex wordIndex; - private final File sbQueueStackPath; - ConcurrentHashMap queueInProcess; + private final Stack sbQueueStack; + private final CrawlProfile profiles; + private final plasmaWordIndex wordIndex; + private final ConcurrentHashMap queueInProcess; public IndexingStack(final plasmaWordIndex wordIndex, final File sbQueueStackPath, final CrawlProfile profiles) { - this.sbQueueStackPath = sbQueueStackPath; this.profiles = profiles; this.wordIndex = wordIndex; this.queueInProcess = new ConcurrentHashMap(); - - initQueueStack(); + this.sbQueueStack = Stack.open(sbQueueStackPath, rowdef); } public static final Row rowdef = new Row( @@ -77,18 +74,7 @@ public class IndexingStack { "String urldescr-80", NaturalOrder.naturalOrder, 0); - - private void initQueueStack() { - sbQueueStack = Stack.open(sbQueueStackPath, rowdef); - } - - /* - private void resetQueueStack() { - try {sbQueueStack.close();} catch (Exception e) {} - if (sbQueueStackPath.exists()) sbQueueStackPath.delete(); - initQueueStack(); - } - */ + public int size() { return (sbQueueStack == null) ? 0 : sbQueueStack.size(); } @@ -131,14 +117,13 @@ public class IndexingStack { } public void clear() { - sbQueueStack = Stack.reset(sbQueueStack); + sbQueueStack.clear(); } public void close() { if (sbQueueStack != null) { sbQueueStack.close(); } - sbQueueStack = null; } protected void finalize() throws Throwable { diff --git a/source/de/anomic/kelondro/table/Stack.java b/source/de/anomic/kelondro/table/Stack.java index c66577363..cd6c08390 100644 --- a/source/de/anomic/kelondro/table/Stack.java +++ b/source/de/anomic/kelondro/table/Stack.java @@ -78,17 +78,14 @@ public final class Stack extends FullRecords { } } - public static Stack reset(final Stack stack) { - // memorize settings to this file - final File f = new File(stack.filename); - final Row row = stack.row(); - - // close and delete the file - try {stack.close();} catch (final Exception e) {} - if (f.exists()) f.delete(); - - // re-open a database with same settings as before - return open(f, row); + public void clear() { + try { + super.clear(); + setHandle(root, null); + setHandle(toor, null); + } catch (IOException e) { + e.printStackTrace(); + } } public Iterator stackIterator(final boolean up) { diff --git a/source/de/anomic/kelondro/text/CachedIndexCollection.java b/source/de/anomic/kelondro/text/CachedIndexCollection.java new file mode 100644 index 000000000..fb5f3c50b --- /dev/null +++ b/source/de/anomic/kelondro/text/CachedIndexCollection.java @@ -0,0 +1,449 @@ +// plasmaWordIndex.java +// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany +// first published 2005 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $ +// $LastChangedRevision: 5709 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro.text; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Set; +import java.util.TreeSet; + +import de.anomic.kelondro.index.RowCollection; +import de.anomic.kelondro.order.Base64Order; +import de.anomic.kelondro.order.ByteOrder; +import de.anomic.kelondro.order.CloneableIterator; +import de.anomic.kelondro.order.Order; +import de.anomic.kelondro.order.RotateIterator; +import de.anomic.kelondro.text.Index; +import de.anomic.kelondro.text.IndexCache; +import de.anomic.kelondro.text.IndexCollection; +import de.anomic.kelondro.text.ReferenceContainer; +import de.anomic.kelondro.text.ReferenceContainerOrder; +import de.anomic.kelondro.text.ReferenceRow; +import de.anomic.kelondro.text.Word; +import de.anomic.kelondro.util.MemoryControl; +import de.anomic.kelondro.util.Log; +import de.anomic.server.serverProfiling; + +public final class CachedIndexCollection implements Index { + + // environment constants + public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes + public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash + public static final int lowcachedivisor = 900; + public static final int maxCollectionPartition = 7; // should be 7 + private static final ByteOrder indexOrder = Base64Order.enhancedCoder; + + + + private final IndexCache indexCache; + private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster + + public CachedIndexCollection( + File indexPrimaryTextLocation, + final int entityCacheMaxSize, + final boolean useCommons, + final int redundancy, + Log log) throws IOException { + + final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE"); + if (!(textindexcache.exists())) textindexcache.mkdirs(); + if (new File(textindexcache, "index.dhtin.blob").exists()) { + // migration of both caches into one + this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); + IndexCache dhtInCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log); + for (ReferenceContainer c: dhtInCache) { + this.indexCache.addReferences(c); + } + new File(textindexcache, "index.dhtin.blob").delete(); + } else { + // read in new BLOB + this.indexCache = new
IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); + } + + // create collections storage path + final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION"); + if (!(textindexcollections.exists())) textindexcollections.mkdirs(); + this.collections = new IndexCollection( + textindexcollections, + "collection", + 12, + Base64Order.enhancedCoder, + maxCollectionPartition, + ReferenceRow.urlEntryRow, + useCommons); + } + + public void clear() { + indexCache.clear(); + try { + collections.clear(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public int minMem() { + return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem(); + } + + public int maxURLinCache() { + return indexCache.maxURLinCache(); + } + + public long minAgeOfCache() { + return indexCache.minAgeOfCache(); + } + + public long maxAgeOfCache() { + return indexCache.maxAgeOfCache(); + } + + public int indexCacheSize() { + return indexCache.size(); + } + + public long indexCacheSizeBytes() { + // calculate the real size in bytes of the index cache + long cacheBytes = 0; + final long entryBytes = ReferenceRow.urlEntryRow.objectsize; + final IndexCache cache = (indexCache); + synchronized (cache) { + final Iterator it = cache.referenceIterator(null, false, true); + while (it.hasNext()) cacheBytes += it.next().size() * entryBytes; + } + return cacheBytes; + } + + public void setMaxWordCount(final int maxWords) { + indexCache.setMaxWordCount(maxWords); + } + + public void cacheFlushControl(final IndexCache theCache) { + // check for forced flush + int cs = cacheSize(); + if (cs > 0) { + // flush elements that are too big. This flushing depends on the fact that the flush rule + // selects the biggest elements first for flushing. If it does not for any reason, the following + // loop would not terminate. 
+ serverProfiling.update("wordcache", Long.valueOf(cs), true); + // To ensure termination an additional counter is used + int l = 0; + while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) { + flushCacheOne(theCache); + } + // next flush more entries if the size exceeds the maximum size of the cache + while (theCache.size() > 0 && + ((theCache.size() > theCache.getMaxWordCount()) || + (MemoryControl.available() < collections.minMem()))) { + flushCacheOne(theCache); + } + if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true); + } + } + + public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) { + return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount); + } + + public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) { + // add the entry + indexCache.addEntry(wordHash, entry, updateTime, true); + cacheFlushControl(this.indexCache); + } + + public void addReferences(final ReferenceContainer entries) { + assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize); + + // add the entry + indexCache.addReferences(entries); + cacheFlushControl(this.indexCache); + } + + public void flushCacheFor(int time) { + flushCacheUntil(System.currentTimeMillis() + time); + } + + private synchronized void flushCacheUntil(long timeout) { + while (System.currentTimeMillis() < timeout && indexCache.size() > 0) { + flushCacheOne(indexCache); + } + } + + private synchronized void flushCacheOne(final IndexCache ram) { + if (ram.size() > 0) collections.addReferences(flushContainer(ram)); + } + + private ReferenceContainer flushContainer(final IndexCache ram) { + String wordHash; + ReferenceContainer c; + wordHash = ram.maxScoreWordHash(); + c = ram.getReferences(wordHash, null); + if ((c != null) && (c.size() > wCacheMaxChunk)) { + return ram.deleteAllReferences(wordHash); + } else { + return ram.deleteAllReferences(ram.bestFlushWordHash()); + } + } + + public boolean hasReferences(final String wordHash) { + if (indexCache.hasReferences(wordHash)) return true; + if (collections.hasReferences(wordHash)) return true; + return false; + } + + public ReferenceContainer getReferences(final String wordHash, final Set urlselection) { + if (wordHash == null) { + // wrong input + return null; + } + + // get from cache + ReferenceContainer container; + container = indexCache.getReferences(wordHash, urlselection); + + // get from collection index + if (container == null) { + container = collections.getReferences(wordHash, urlselection); + } else { + container.addAllUnique(collections.getReferences(wordHash, urlselection)); + } + + if (container == null) return null; + + // check doubles + final int beforeDouble = container.size(); + container.sort(); + final ArrayList d = container.removeDoubles(); + RowCollection set; + for (int i = 0; i < d.size(); i++) { + // for each element in the double-set, take the most recent one + set = d.get(i); + ReferenceRow e, elm = null; + long lm = 0; + for (int j = 0; j < set.size(); j++) { + e = new ReferenceRow(set.get(j, true)); + if ((elm == null) || (e.lastModified() > lm)) { + elm = e; + lm = e.lastModified(); + } + } + if(elm != null) { + container.addUnique(elm.toKelondroEntry()); + } + } + if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash()); + 
return container; + } + + /** + * return map of wordhash:indexContainer + * + * @param wordHashes + * @param urlselection + * @param interruptIfEmpty + * @return + */ + public HashMap getContainers(final Set wordHashes, final Set urlselection, final boolean interruptIfEmpty) { + // retrieve entities that belong to the hashes + final HashMap containers = new HashMap(wordHashes.size()); + String singleHash; + ReferenceContainer singleContainer; + final Iterator i = wordHashes.iterator(); + while (i.hasNext()) { + + // get next word hash: + singleHash = i.next(); + + // retrieve index + singleContainer = getReferences(singleHash, urlselection); + + // check result + if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap(0); + + containers.put(singleHash, singleContainer); + } + return containers; + } + + @SuppressWarnings("unchecked") + public HashMap[] localSearchContainers( + final TreeSet queryHashes, + final TreeSet excludeHashes, + final Set urlselection) { + // search for the set of hashes and return a map of wordhash:indexContainer containing the search result + + // retrieve entities that belong to the hashes + HashMap inclusionContainers = (queryHashes.size() == 0) ? new HashMap(0) : getContainers( + queryHashes, + urlselection, + true); + if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap(0); // prevent that only a subset is returned + final HashMap exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap(0) : getContainers( + excludeHashes, + urlselection, + true); + return new HashMap[]{inclusionContainers, exclusionContainers}; + } + + public int size() { + return java.lang.Math.max(collections.size(), indexCache.size()); + } + + public int collectionsSize() { + return collections.size(); + } + + public int cacheSize() { + return indexCache.size(); + } + + public void close() { + indexCache.close(); + collections.close(); + } + + public ReferenceContainer deleteAllReferences(final String wordHash) { + final ReferenceContainer c = new ReferenceContainer( + wordHash, + ReferenceRow.urlEntryRow, + indexCache.countReferences(wordHash)); + c.addAllUnique(indexCache.deleteAllReferences(wordHash)); + c.addAllUnique(collections.deleteAllReferences(wordHash)); + return c; + } + + public boolean removeReference(final String wordHash, final String urlHash) { + boolean removed = false; + removed = removed | (indexCache.removeReference(wordHash, urlHash)); + removed = removed | (collections.removeReference(wordHash, urlHash)); + return removed; + } + + public int removeEntryMultiple(final Set wordHashes, final String urlHash) { + // remove the same url hashes for multiple words + // this is mainly used when correcting an index after a search + final Iterator i = wordHashes.iterator(); + int count = 0; + while (i.hasNext()) { + if (removeReference(i.next(), urlHash)) count++; + } + return count; + } + + public int removeReferences(final String wordHash, final Set urlHashes) { + int removed = 0; + removed += indexCache.removeReferences(wordHash, urlHashes); + removed += collections.removeReferences(wordHash, urlHashes); + return removed; + } + + public String removeEntriesExpl(final String wordHash, final Set urlHashes) { + String removed = ""; + removed += indexCache.removeReferences(wordHash, urlHashes) + ", "; + removed += collections.removeReferences(wordHash, urlHashes); + return removed; + } + + public void
removeEntriesMultiple(final Set wordHashes, final Set urlHashes) { + // remove the same url hashes for multiple words + // this is mainly used when correcting an index after a search + final Iterator i = wordHashes.iterator(); + while (i.hasNext()) { + removeReferences(i.next(), urlHashes); + } + } + + public int removeWordReferences(final Set words, final String urlhash) { + // sequentially delete all word references + // returns number of deletions + final Iterator iter = words.iterator(); + int count = 0; + while (iter.hasNext()) { + // delete the URL reference in this word index + if (removeReference(Word.word2hash(iter.next()), urlhash)) count++; + } + return count; + } + + public synchronized TreeSet indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) { + // creates a set of indexContainers + // this does not use the cache + final Order containerOrder = new ReferenceContainerOrder(indexOrder.clone()); + containerOrder.rotate(emptyContainer(startHash, 0)); + final TreeSet containers = new TreeSet(containerOrder); + final Iterator i = referenceIterator(startHash, rot, ram); + if (ram) count = Math.min(indexCache.size(), count); + ReferenceContainer container; + // this loop does not terminate using the i.hasNext() predicate when rot == true + // because then the underlying iterator is a rotating iterator without termination + // in this case a termination must be ensured with a counter + // It must also be ensured that the counter is decreased in every loop + while ((count > 0) && (i.hasNext())) { + container = i.next(); + if ((container != null) && (container.size() > 0)) { + containers.add(container); + } + count--; // decrease counter even if the container was null or empty to ensure termination + } + return containers; // this may return fewer containers than demanded + } + + public synchronized CloneableIterator referenceIterator(final String startHash, final boolean rot, final boolean ram) { + final CloneableIterator i = wordContainers(startHash, ram); + if (rot) { + return new RotateIterator(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ?
0 : collections.size())); + } + return i; + } + + private synchronized CloneableIterator wordContainers(final String startWordHash, final boolean ram) { + final Order containerOrder = new ReferenceContainerOrder(indexOrder.clone()); + containerOrder.rotate(emptyContainer(startWordHash, 0)); + if (ram) { + return indexCache.referenceIterator(startWordHash, false, true); + } + return collections.referenceIterator(startWordHash, false, false); + /* + return new MergeIterator( + indexCache.referenceIterator(startWordHash, false, true), + collections.referenceIterator(startWordHash, false, false), + containerOrder, + ReferenceContainer.containerMergeMethod, + true); + */ + } + + public int countReferences(String key) { + return indexCache.countReferences(key) + collections.countReferences(key); + } + +} diff --git a/source/de/anomic/kelondro/text/ReferenceContainer.java b/source/de/anomic/kelondro/text/ReferenceContainer.java index 875f47f5c..6c79abda6 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainer.java +++ b/source/de/anomic/kelondro/text/ReferenceContainer.java @@ -38,7 +38,6 @@ import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.RowSet; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.util.ByteBuffer; -import de.anomic.plasma.plasmaWordIndex; public class ReferenceContainer extends RowSet { @@ -229,11 +228,11 @@ public class ReferenceContainer extends RowSet { // join a search result and return the joincount (number of pages after join) // since this is a conjunction we return an empty entity if any word is not known - if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0); + if (includeContainers == null) return CachedIndexCollection.emptyContainer(null, 0); // join the result final ReferenceContainer rcLocal = ReferenceContainer.joinContainers(includeContainers, maxDistance); - if (rcLocal == null) return plasmaWordIndex.emptyContainer(null, 0); + if (rcLocal == null) return CachedIndexCollection.emptyContainer(null, 0); excludeContainers(rcLocal, excludeContainers); return rcLocal; diff --git a/source/de/anomic/plasma/plasmaDbImporter.java b/source/de/anomic/plasma/plasmaDbImporter.java index 2855ca490..973e1a617 100644 --- a/source/de/anomic/plasma/plasmaDbImporter.java +++ b/source/de/anomic/plasma/plasmaDbImporter.java @@ -36,7 +36,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { super("PLASMADB"); this.homeWordIndex = homeWI; this.importWordIndex = importWI; - this.importStartSize = this.importWordIndex.size(); + this.importStartSize = this.importWordIndex.index().size(); } /** @@ -93,15 +93,15 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { try { this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation(true).getAbsolutePath() + "'"); - this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.metadata().size() + " URLs."); - this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.metadata().size() + " URLs."); + this.log.logInfo("Home word index contains " + homeWordIndex.index().size() + " words and " + homeWordIndex.metadata().size() + " URLs."); + this.log.logInfo("Import word index contains " + this.importWordIndex.index().size() + " words and " + this.importWordIndex.metadata().size() + " URLs."); final HashSet unknownUrlBuffer = new HashSet(); final HashSet importedUrlBuffer = new HashSet(); // iterate over all words 
from import db //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false); - Iterator indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator(); + Iterator indexContainerIterator = this.importWordIndex.index().indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator(); while (!isAborted() && indexContainerIterator.hasNext()) { final TreeSet entityUrls = new TreeSet(); @@ -169,10 +169,10 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { if (isAborted()) break; // importing entity container to home db - if (newContainer.size() > 0) { homeWordIndex.addReferences(newContainer); } + if (newContainer.size() > 0) { homeWordIndex.index().addReferences(newContainer); } // delete complete index entity file - this.importWordIndex.deleteAllReferences(this.wordHash); + this.importWordIndex.index().deleteAllReferences(this.wordHash); // print out some statistical information if (this.entryCounter % 500 == 0) { @@ -189,8 +189,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { "Speed: "+ 500*1000/duration + " word entities/s" + " | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) + " | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" + - "Home Words = " + homeWordIndex.size() + - " | Import Words = " + this.importWordIndex.size()); + "Home Words = " + homeWordIndex.index().size() + + " | Import Words = " + this.importWordIndex.index().size()); this.wordChunkStart = this.wordChunkEnd; this.wordChunkStartHash = this.wordChunkEndHash; } @@ -203,7 +203,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { if (!indexContainerIterator.hasNext()) { // We may not be finished yet, try to get the next chunk of wordHashes - final TreeSet containers = this.importWordIndex.indexContainerSet(this.wordHash, false, false, 100); + final TreeSet containers = this.importWordIndex.index().indexContainerSet(this.wordHash, false, false, 100); indexContainerIterator = containers.iterator(); // Make sure we don't get the same wordhash twice, but don't skip a word if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getWordHash()))) { @@ -212,8 +212,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { } } - this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.metadata().size() + " URLs."); - this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.metadata().size() + " URLs."); + this.log.logInfo("Home word index contains " + homeWordIndex.index().size() + " words and " + homeWordIndex.metadata().size() + " URLs."); + this.log.logInfo("Import word index contains " + this.importWordIndex.index().size() + " words and " + this.importWordIndex.metadata().size() + " URLs."); } catch (final Exception e) { this.log.logSevere("Database import failed.",e); e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index e2b7c6ed0..553a04fc4 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -248,7 +248,7 @@ public final class plasmaSearchEvent { if (rw > 0) { final Set removeWords = cleanEvent.query.queryHashes; 
removeWords.addAll(cleanEvent.query.excludeHashes); - cleanEvent.wordIndex.removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet()); + cleanEvent.wordIndex.index().removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet()); Log.logInfo("SearchEvents", "cleaning up event " + cleanEvent.query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words"); } @@ -301,7 +301,7 @@ public final class plasmaSearchEvent { (query.constraint.get(plasmaCondenser.flag_cat_indexof)) && (!(metadata.dc_title().startsWith("Index of")))) { final Iterator wi = query.queryHashes.iterator(); - while (wi.hasNext()) wordIndex.removeReference(wi.next(), page.hash()); + while (wi.hasNext()) wordIndex.index().removeReference(wi.next(), page.hash()); registerFailure(page.hash(), "index-of constraint not fullfilled"); return null; } @@ -824,7 +824,7 @@ public final class plasmaSearchEvent { String address = null; if ((seed == null) || ((address = seed.getPublicAddress()) == null)) { // seed is not known from here - wordIndex.removeWordReferences( + wordIndex.index().removeWordReferences( plasmaCondenser.getWords( ("yacyshare " + filename.replace('?', ' ') + diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index b1514db82..19ed465ec 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -110,7 +110,7 @@ public final class plasmaSearchRankingProcess { public void execQuery() { long timer = System.currentTimeMillis(); - this.localSearchContainerMaps = wordIndex.localSearchContainers(query.queryHashes, query.excludeHashes, null); + this.localSearchContainerMaps = wordIndex.index().localSearchContainers(query.queryHashes, query.excludeHashes, null); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size(), System.currentTimeMillis() - timer), false); // join and exclude the local result diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 0740ff747..80f061853 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -952,12 +952,12 @@ public class plasmaSnippetCache { assert plasmaSwitchboard.getSwitchboard().webIndex != null; assert event != null : "eventID = " + eventID; assert event.getQuery() != null; - plasmaSwitchboard.getSwitchboard().webIndex.removeEntryMultiple(event.getQuery().queryHashes, urlHash); + plasmaSwitchboard.getSwitchboard().webIndex.index().removeEntryMultiple(event.getQuery().queryHashes, urlHash); event.remove(urlHash); } if (snippet.getErrorCode() == ERROR_NO_MATCH) { log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError()); - plasmaSwitchboard.getSwitchboard().webIndex.removeEntryMultiple(snippet.remaingHashes, urlHash); + plasmaSwitchboard.getSwitchboard().webIndex.index().removeEntryMultiple(snippet.remaingHashes, urlHash); plasmaSearchEvent.getEvent(eventID).remove(urlHash); } return snippet.getError(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 29bf2e6ae..dbe02b9a5 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -329,7 +329,7 @@ 
public final class plasmaSwitchboard extends serverAbstractSwitch it = cache.referenceIterator(null, false, true); - while (it.hasNext()) cacheBytes += it.next().size() * entryBytes; - } - return cacheBytes; - } - - public void setMaxWordCount(final int maxWords) { - indexCache.setMaxWordCount(maxWords); - } - - public void cacheFlushControl(final IndexCache theCache) { - // check for forced flush - int cs = cacheSize(); - if (cs > 0) { - // flush elements that are too big. This flushing depends on the fact that the flush rule - // selects the biggest elements first for flushing. If it does not for any reason, the following - // loop would not terminate. - serverProfiling.update("wordcache", Long.valueOf(cs), true); - // To ensure termination an additional counter is used - int l = 0; - while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) { - flushCacheOne(theCache); - } - // next flush more entries if the size exceeds the maximum size of the cache - while (theCache.size() > 0 && - ((theCache.size() > theCache.getMaxWordCount()) || - (MemoryControl.available() < collections.minMem()))) { - flushCacheOne(theCache); - } - if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true); - } - } - - public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) { - return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount); - } - public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) { - // add the entry - indexCache.addEntry(wordHash, entry, updateTime, true); - cacheFlushControl(this.indexCache); - } - - public void addReferences(final ReferenceContainer entries) { - assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize); - - // add the entry - indexCache.addReferences(entries); - cacheFlushControl(this.indexCache); - } - - public void flushCacheFor(int time) { - flushCacheUntil(System.currentTimeMillis() + time); - } - - private synchronized void flushCacheUntil(long timeout) { - while (System.currentTimeMillis() < timeout && indexCache.size() > 0) { - flushCacheOne(indexCache); - } - } - - private synchronized void flushCacheOne(final IndexCache ram) { - if (ram.size() > 0) collections.addReferences(flushContainer(ram)); - } - - private ReferenceContainer flushContainer(final IndexCache ram) { - String wordHash; - ReferenceContainer c; - wordHash = ram.maxScoreWordHash(); - c = ram.getReferences(wordHash, null); - if ((c != null) && (c.size() > wCacheMaxChunk)) { - return ram.deleteAllReferences(wordHash); - } else { - return ram.deleteAllReferences(ram.bestFlushWordHash()); - } - } - - /** * this is called by the switchboard to put in a new page into the index * use all the words in one condenser object to simultanous create index entries @@ -526,221 +383,20 @@ public final class plasmaWordIndex implements Index { doctype, outlinksSame, outlinksOther, wprop.flags); - addEntry(Word.word2hash(word), ientry, System.currentTimeMillis()); + this.index.addEntry(Word.word2hash(word), ientry, System.currentTimeMillis()); wordCount++; } return wordCount; } - public boolean hasReferences(final String wordHash) { - if (indexCache.hasReferences(wordHash)) return true; - if (collections.hasReferences(wordHash)) return true; - return false; - } - - public ReferenceContainer getReferences(final String wordHash, final Set urlselection) { - if ((wordHash == null) || (wordHash.length() != yacySeedDB.commonHashLength)) { - // wrong 
input - return null; - } - - // get from cache - ReferenceContainer container; - container = indexCache.getReferences(wordHash, urlselection); - - // get from collection index - if (container == null) { - container = collections.getReferences(wordHash, urlselection); - } else { - container.addAllUnique(collections.getReferences(wordHash, urlselection)); - } - - if (container == null) return null; - - // check doubles - final int beforeDouble = container.size(); - container.sort(); - final ArrayList d = container.removeDoubles(); - RowCollection set; - for (int i = 0; i < d.size(); i++) { - // for each element in the double-set, take that one that is the most recent one - set = d.get(i); - ReferenceRow e, elm = null; - long lm = 0; - for (int j = 0; j < set.size(); j++) { - e = new ReferenceRow(set.get(j, true)); - if ((elm == null) || (e.lastModified() > lm)) { - elm = e; - lm = e.lastModified(); - } - } - if(elm != null) { - container.addUnique(elm.toKelondroEntry()); - } - } - if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash()); - - return container; - } - - /** - * return map of wordhash:indexContainer - * - * @param wordHashes - * @param urlselection - * @param deleteIfEmpty - * @param interruptIfEmpty - * @return - */ - public HashMap getContainers(final Set wordHashes, final Set urlselection, final boolean interruptIfEmpty) { - // retrieve entities that belong to the hashes - final HashMap containers = new HashMap(wordHashes.size()); - String singleHash; - ReferenceContainer singleContainer; - final Iterator i = wordHashes.iterator(); - while (i.hasNext()) { - - // get next word hash: - singleHash = i.next(); - - // retrieve index - singleContainer = getReferences(singleHash, urlselection); - - // check result - if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap(0); - - containers.put(singleHash, singleContainer); - } - return containers; - } - - @SuppressWarnings("unchecked") - public HashMap[] localSearchContainers( - final TreeSet queryHashes, - final TreeSet excludeHashes, - final Set urlselection) { - // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result - - // retrieve entities that belong to the hashes - HashMap inclusionContainers = (queryHashes.size() == 0) ? new HashMap(0) : getContainers( - queryHashes, - urlselection, - true); - if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap(0); // prevent that only a subset is returned - final HashMap exclusionContainers = (inclusionContainers.size() == 0) ? 
new HashMap(0) : getContainers( - excludeHashes, - urlselection, - true); - return new HashMap[]{inclusionContainers, exclusionContainers}; - } - - public int size() { - return java.lang.Math.max(collections.size(), indexCache.size()); - } - - public int collectionsSize() { - return collections.size(); - } - - public int cacheSize() { - return indexCache.size(); - } - public void close() { - indexCache.close(); - collections.close(); + index.close(); metadata.close(); peers.close(); profilesActiveCrawls.close(); queuePreStack.close(); } - - public ReferenceContainer deleteAllReferences(final String wordHash) { - final ReferenceContainer c = new ReferenceContainer( - wordHash, - ReferenceRow.urlEntryRow, - indexCache.countReferences(wordHash)); - c.addAllUnique(indexCache.deleteAllReferences(wordHash)); - c.addAllUnique(collections.deleteAllReferences(wordHash)); - return c; - } - - public boolean removeReference(final String wordHash, final String urlHash) { - boolean removed = false; - removed = removed | (indexCache.removeReference(wordHash, urlHash)); - removed = removed | (collections.removeReference(wordHash, urlHash)); - return removed; - } - - public int removeEntryMultiple(final Set wordHashes, final String urlHash) { - // remove the same url hashes for multiple words - // this is mainly used when correcting a index after a search - final Iterator i = wordHashes.iterator(); - int count = 0; - while (i.hasNext()) { - if (removeReference(i.next(), urlHash)) count++; - } - return count; - } - - public int removeReferences(final String wordHash, final Set urlHashes) { - int removed = 0; - removed += indexCache.removeReferences(wordHash, urlHashes); - removed += collections.removeReferences(wordHash, urlHashes); - return removed; - } - - public String removeEntriesExpl(final String wordHash, final Set urlHashes) { - String removed = ""; - removed += indexCache.removeReferences(wordHash, urlHashes) + ", "; - removed += collections.removeReferences(wordHash, urlHashes); - return removed; - } - - public void removeEntriesMultiple(final Set wordHashes, final Set urlHashes) { - // remove the same url hashes for multiple words - // this is mainly used when correcting a index after a search - final Iterator i = wordHashes.iterator(); - while (i.hasNext()) { - removeReferences(i.next(), urlHashes); - } - } - - public int removeWordReferences(final Set words, final String urlhash) { - // sequentially delete all word references - // returns number of deletions - final Iterator iter = words.iterator(); - int count = 0; - while (iter.hasNext()) { - // delete the URL reference in this word index - if (removeReference(Word.word2hash(iter.next()), urlhash)) count++; - } - return count; - } - - public synchronized TreeSet indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) { - // creates a set of indexContainers - // this does not use the cache - final Order containerOrder = new ReferenceContainerOrder(indexOrder.clone()); - containerOrder.rotate(emptyContainer(startHash, 0)); - final TreeSet containers = new TreeSet(containerOrder); - final Iterator i = referenceIterator(startHash, rot, ram); - if (ram) count = Math.min(indexCache.size(), count); - ReferenceContainer container; - // this loop does not terminate using the i.hasNex() predicate when rot == true - // because then the underlying iterator is a rotating iterator without termination - // in this case a termination must be ensured with a counter - // It must also be ensured that the counter is 
in/decreased every loop - while ((count > 0) && (i.hasNext())) { - container = i.next(); - if ((container != null) && (container.size() > 0)) { - containers.add(container); - } - count--; // decrease counter even if the container was null or empty to ensure termination - } - return containers; // this may return less containers as demanded - } public MetadataRowContainer storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException { final long startTime = System.currentTimeMillis(); @@ -856,32 +512,6 @@ public final class plasmaWordIndex implements Index { return newEntry; } - public synchronized CloneableIterator referenceIterator(final String startHash, final boolean rot, final boolean ram) { - final CloneableIterator i = wordContainers(startHash, ram); - if (rot) { - return new RotateIterator(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ? 0 : collections.size())); - } - return i; - } - - private synchronized CloneableIterator wordContainers(final String startWordHash, final boolean ram) { - final Order containerOrder = new ReferenceContainerOrder(indexOrder.clone()); - containerOrder.rotate(emptyContainer(startWordHash, 0)); - if (ram) { - return indexCache.referenceIterator(startWordHash, false, true); - } - return collections.referenceIterator(startWordHash, false, false); - /* - return new MergeIterator( - indexCache.referenceIterator(startWordHash, false, true), - collections.referenceIterator(startWordHash, false, false), - containerOrder, - ReferenceContainer.containerMergeMethod, - true); - */ - } - - // The Cleaner class was provided as "UrldbCleaner" by Hydrox public synchronized ReferenceCleaner getReferenceCleaner(final String startHash) { return new ReferenceCleaner(startHash); @@ -899,7 +529,7 @@ public final class plasmaWordIndex implements Index { public ReferenceCleaner(final String startHash) { this.startHash = startHash; - this.rwiCountAtStart = size(); + this.rwiCountAtStart = index().size(); } public void run() { @@ -908,7 +538,7 @@ public final class plasmaWordIndex implements Index { ReferenceRow entry = null; yacyURL url = null; final HashSet urlHashs = new HashSet(); - Iterator indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator(); + Iterator indexContainerIterator = index.indexContainerSet(startHash, false, false, 100).iterator(); while (indexContainerIterator.hasNext() && run) { waiter(); container = indexContainerIterator.next(); @@ -930,7 +560,7 @@ public final class plasmaWordIndex implements Index { } } if (urlHashs.size() > 0) { - final int removed = removeReferences(container.getWordHash(), urlHashs); + final int removed = index.removeReferences(container.getWordHash(), urlHashs); Log.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted"); lastWordHash = container.getWordHash(); lastDeletionCounter = urlHashs.size(); @@ -938,7 +568,7 @@ public final class plasmaWordIndex implements Index { } if (!containerIterator.hasNext()) { // We may not be finished yet, try to get the next chunk of wordHashes - final TreeSet containers = indexContainerSet(container.getWordHash(), false, false, 100); + final TreeSet containers = index.indexContainerSet(container.getWordHash(), false, false, 100); indexContainerIterator = containers.iterator(); // Make sure we don't get the same wordhash twice, but don't skip a word if ((indexContainerIterator.hasNext()) && 
(!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) { @@ -988,9 +618,4 @@ public final class plasmaWordIndex implements Index { } } } - - public int countReferences(String key) { - return indexCache.countReferences(key) + collections.countReferences(key); - } - } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index af133a64d..2d4187006 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -69,6 +69,7 @@ import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.Digest; +import de.anomic.kelondro.text.CachedIndexCollection; import de.anomic.kelondro.text.MetadataRowContainer; import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.ReferenceContainer; @@ -529,7 +530,7 @@ public final class yacyClient { final int words = wordhashes.length() / yacySeedDB.commonHashLength; final ReferenceContainer[] container = new ReferenceContainer[words]; for (int i = 0; i < words; i++) { - container[i] = plasmaWordIndex.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), count); + container[i] = CachedIndexCollection.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), count); } // insert results to containers @@ -638,7 +639,7 @@ public final class yacyClient { // insert the containers to the index for (int m = 0; m < words; m++) { - wordIndex.addReferences(container[m]); + wordIndex.index().addReferences(container[m]); } // generate statistics diff --git a/source/yacy.java b/source/yacy.java index 6c327f4b2..fd05ca690 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -676,7 +676,7 @@ public final class yacy { if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0); - final Iterator indexContainerIterator = wordIndex.referenceIterator("AAAAAAAAAAAA", false, false); + final Iterator indexContainerIterator = wordIndex.index().referenceIterator("AAAAAAAAAAAA", false, false); long urlCounter = 0, wordCounter = 0; long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0; @@ -867,7 +867,7 @@ public final class yacy { Iterator indexContainerIterator = null; if (resource.equals("all")) { WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0); - indexContainerIterator = WordIndex.referenceIterator(wordChunkStartHash, false, false); + indexContainerIterator = WordIndex.index().referenceIterator(wordChunkStartHash, false, false); } int counter = 0; ReferenceContainer container = null;
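
Note on the structure this patch introduces: plasmaWordIndex keeps its role as the switchboard-facing facade, but the raw RWI operations now live in CachedIndexCollection, which merges the RAM-side IndexCache with the on-disk IndexCollection. The sketch below illustrates that layering with simplified stand-in classes; all names and signatures here are illustrative only, not the actual YaCy sources. The two aggregation rules are taken from the patch itself: size() is the maximum of both structures, countReferences() is their sum.

// RefactoringSketch.java -- a minimal stand-in model of the new layering.
import java.util.HashMap;
import java.util.Map;

interface Index {
    int size();
    int countReferences(String wordHash);
}

// stand-in for IndexCache: the RAM-side write cache
class RamIndex implements Index {
    private final Map<String, Integer> refs = new HashMap<String, Integer>();
    public void addEntry(String wordHash) {
        final Integer n = refs.get(wordHash);
        refs.put(wordHash, (n == null) ? 1 : n + 1);
    }
    public int size() { return refs.size(); }
    public int countReferences(String wordHash) {
        final Integer n = refs.get(wordHash);
        return (n == null) ? 0 : n;
    }
}

// stand-in for IndexCollection: the on-disk collection index (kept in RAM here)
class CollectionIndex extends RamIndex {}

// stand-in for CachedIndexCollection: one Index over cache + collections
class CachedIndex implements Index {
    private final RamIndex indexCache = new RamIndex();
    private final CollectionIndex collections = new CollectionIndex();
    public void addEntry(String wordHash) { indexCache.addEntry(wordHash); }
    // as in the patch: the word count is the maximum of both structures
    public int size() { return Math.max(collections.size(), indexCache.size()); }
    // as in the patch: cache and collection reference counts are added
    public int countReferences(String wordHash) {
        return indexCache.countReferences(wordHash) + collections.countReferences(wordHash);
    }
}

// stand-in for plasmaWordIndex after the patch: RWI access goes through index()
class WordIndex {
    private final CachedIndex index = new CachedIndex();
    public CachedIndex index() { return index; }
}

public class RefactoringSketch {
    public static void main(String[] args) {
        final WordIndex webIndex = new WordIndex();
        webIndex.index().addEntry("exampleWordHash");
        // callers that used webIndex.size() before now call webIndex.index().size(),
        // the mechanical one-line change repeated throughout this diff
        System.out.println("rwi count: " + webIndex.index().size());
    }
}

This is why most hunks above are one-line substitutions from sb.webIndex.foo() to sb.webIndex.index().foo(): callers now depend on the Index interface of CachedIndexCollection rather than on plasmaWordIndex internals, and plasmaWordIndex shrinks to document storage, metadata, peers, and queue handling.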