refactoring of plasmaWordIndex: fewer methods in the class; the index part was separated out into the new CachedIndexCollection

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5710 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 14a1c33823
commit 7f67238f8b
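
The pattern behind most of the hunks below: plasmaWordIndex no longer implements the Index interface itself; the RWI storage (IndexCache plus IndexCollection) moves into the new CachedIndexCollection, which callers reach through an index() accessor, so call sites change from webIndex.size() to webIndex.index().size(). A minimal, self-contained sketch of that delegation pattern follows; every class and method name in it is invented for illustration except index(), which mirrors the accessor used throughout this commit.

import java.util.HashMap;
import java.util.Map;

// stands in for CachedIndexCollection: the extracted component that owns the index data
final class ExtractedIndex {
    private final Map<String, Integer> references = new HashMap<String, Integer>();
    public void addEntry(final String wordHash, final int count) {
        // accumulate reference counts per word hash
        final Integer old = references.get(wordHash);
        references.put(wordHash, (old == null) ? count : old + count);
    }
    public int size() { return references.size(); }
    public void clear() { references.clear(); }
}

// stands in for plasmaWordIndex: the facade keeps its other duties and delegates index work
final class WordIndexFacade {
    private final ExtractedIndex index = new ExtractedIndex();
    public ExtractedIndex index() { return this.index; } // the accessor the call sites now use
}

public final class IndexAccessorDemo {
    public static void main(final String[] args) {
        final WordIndexFacade webIndex = new WordIndexFacade();
        webIndex.index().addEntry("abc123xyz789", 1);             // was: webIndex.addEntry(...)
        System.out.println("words = " + webIndex.index().size()); // was: webIndex.size()
    }
}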

@@ -94,7 +94,7 @@ public class IndexCleaner_p {
 prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + "");
 prop.put("rwidb_threadToString", indexCleanerThread.toString());
 prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
-prop.putNum("rwidb_RWIcountnow", sb.webIndex.size());
+prop.putNum("rwidb_RWIcountnow", sb.webIndex.index().size());
 prop.put("rwidb_wordHashNow", indexCleanerThread.wordHashNow);
 prop.put("rwidb_lastWordHash", indexCleanerThread.lastWordHash);
 prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);

@@ -124,7 +124,7 @@ public class IndexControlRWIs_p {
 if (delurl || delurlref) {
 // generate an urlx array
 ReferenceContainer index = null;
-index = sb.webIndex.getReferences(keyhash, null);
+index = sb.webIndex.index().getReferences(keyhash, null);
 final Iterator<ReferenceRow> en = index.entries();
 int i = 0;
 urlx = new String[index.size()];
@@ -141,7 +141,7 @@ public class IndexControlRWIs_p {
 sb.urlRemove(urlx[i]);
 }
 }
-sb.webIndex.deleteAllReferences(keyhash);
+sb.webIndex.index().deleteAllReferences(keyhash);
 post.remove("keyhashdeleteall");
 post.put("urllist", "generated");
 }
@@ -158,7 +158,7 @@ public class IndexControlRWIs_p {
 }
 final Set<String> urlHashes = new HashSet<String>();
 for (int i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]);
-sb.webIndex.removeReferences(keyhash, urlHashes);
+sb.webIndex.index().removeReferences(keyhash, urlHashes);
 // this shall lead to a presentation of the list; so handle that the remaining program
 // thinks that it was called for a list presentation
 post.remove("keyhashdelete");
@@ -200,7 +200,7 @@ public class IndexControlRWIs_p {
 // prepare index
 ReferenceContainer index;
 final long starttime = System.currentTimeMillis();
-index = sb.webIndex.getReferences(keyhash, null);
+index = sb.webIndex.index().getReferences(keyhash, null);
 // built urlCache
 final Iterator<ReferenceRow> urlIter = index.entries();
 final HashMap<String, MetadataRowContainer> knownURLs = new HashMap<String, MetadataRowContainer>();
@@ -237,7 +237,7 @@ public class IndexControlRWIs_p {
 // generate list
 if (post.containsKey("keyhashsimilar")) {
-final Iterator<ReferenceContainer> containerIt = sb.webIndex.indexContainerSet(keyhash, false, true, 256).iterator();
+final Iterator<ReferenceContainer> containerIt = sb.webIndex.index().indexContainerSet(keyhash, false, true, 256).iterator();
 ReferenceContainer container;
 int i = 0;
 int rows = 0, cols = 0;
@@ -315,7 +315,7 @@ public class IndexControlRWIs_p {
 } catch (final IOException e) {
 }
 }
-sb.webIndex.removeReferences(keyhash, urlHashes);
+sb.webIndex.index().removeReferences(keyhash, urlHashes);
 }
 if (prop.getInt("searchresult", 0) == 3) plasmaSearchAPI.listHosts(prop, keyhash, sb);
@@ -323,7 +323,7 @@ public class IndexControlRWIs_p {
 // insert constants
-prop.putNum("wcount", sb.webIndex.size());
+prop.putNum("wcount", sb.webIndex.index().size());
 // return rewrite properties
 return prop;
 }

@@ -182,7 +182,7 @@ public class IndexControlURLs_p {
 // generate list
 if (post.containsKey("urlhashsimilar")) {
 try {
-final Iterator<MetadataRowContainer> entryIt = new RotateIterator<MetadataRowContainer>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.size());
+final Iterator<MetadataRowContainer> entryIt = new RotateIterator<MetadataRowContainer>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size());
 final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
 MetadataRowContainer entry;
 int i = 0;

@@ -106,7 +106,7 @@ public final class IndexImport_p {
 }
 }
-prop.putNum("wcount", switchboard.webIndex.size());
+prop.putNum("wcount", switchboard.webIndex.index().size());
 prop.putNum("ucount", switchboard.webIndex.metadata().size());
 /*

@@ -55,7 +55,7 @@ public class IndexShare_p {
 prop.put("wordfreq", switchboard.getConfigLong("defaultWordReceiveFrequency",10));
 prop.put("dtable", "");
 prop.put("rtable", "");
-prop.putNum("wcount", switchboard.webIndex.size());
+prop.putNum("wcount", switchboard.webIndex.index().size());
 prop.putNum("ucount", switchboard.webIndex.metadata().size());
 return prop; // be save
 }
@@ -68,7 +68,7 @@ public class IndexShare_p {
 }
 // insert constants
-prop.putNum("wcount", switchboard.webIndex.size());
+prop.putNum("wcount", switchboard.webIndex.index().size());
 prop.putNum("ucount", switchboard.webIndex.metadata().size());
 // return rewrite properties

@@ -41,7 +41,7 @@ public class PerformanceGraph {
 final int width = post.getInt("width", 660);
 final int height = post.getInt("height", 240);
-return plasmaProfiling.performanceGraph(width, height, sb.webIndex.metadata().size() + " URLS / " + sb.webIndex.collectionsSize() + " WORDS IN COLLECTIONS / " + sb.webIndex.cacheSize() + " WORDS IN CACHE");
+return plasmaProfiling.performanceGraph(width, height, sb.webIndex.metadata().size() + " URLS / " + sb.webIndex.index().collectionsSize() + " WORDS IN COLLECTIONS / " + sb.webIndex.index().cacheSize() + " WORDS IN CACHE");
 }
 }

@@ -199,7 +199,7 @@ public class PerformanceQueues_p {
 // disallow setting of memprereq for indexer to prevent db from throwing OOMs
 prop.put("table_" + c + "_disabled", /*(threadName.endsWith("_indexing")) ? 1 :*/ "0");
 prop.put("table_" + c + "_recommendation", threadName.endsWith("_indexing") ? "1" : "0");
-prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.webIndex.minMem() / 1024) : 0);
+prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.webIndex.index().minMem() / 1024) : 0);
 c++;
 }
 prop.put("table", c);
@@ -229,7 +229,7 @@ public class PerformanceQueues_p {
 if ((post != null) && (post.containsKey("cacheSizeSubmit"))) {
 final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000);
 switchboard.setConfig(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
-switchboard.webIndex.setMaxWordCount(wordCacheMaxCount);
+switchboard.webIndex.index().setMaxWordCount(wordCacheMaxCount);
 final int wordCacheInitCount = post.getInt(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, 30000);
 switchboard.setConfig(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, Integer.toString(wordCacheInitCount));
@@ -288,11 +288,11 @@ public class PerformanceQueues_p {
 // table cache settings
 prop.putNum("urlCacheSize", switchboard.webIndex.metadata().writeCacheSize());
-prop.putNum("wordCacheSize", switchboard.webIndex.indexCacheSize());
-prop.putNum("wordCacheSizeKBytes", switchboard.webIndex.indexCacheSizeBytes()/1024);
-prop.putNum("maxURLinCache", switchboard.webIndex.maxURLinCache());
-prop.putNum("maxAgeOfCache", switchboard.webIndex.maxAgeOfCache() / 1000 / 60); // minutes
-prop.putNum("minAgeOfCache", switchboard.webIndex.minAgeOfCache() / 1000 / 60); // minutes
+prop.putNum("wordCacheSize", switchboard.webIndex.index().indexCacheSize());
+prop.putNum("wordCacheSizeKBytes", switchboard.webIndex.index().indexCacheSizeBytes()/1024);
+prop.putNum("maxURLinCache", switchboard.webIndex.index().maxURLinCache());
+prop.putNum("maxAgeOfCache", switchboard.webIndex.index().maxAgeOfCache() / 1000 / 60); // minutes
+prop.putNum("minAgeOfCache", switchboard.webIndex.index().minAgeOfCache() / 1000 / 60); // minutes
 prop.putNum("maxWaitingWordFlush", switchboard.getConfigLong("maxWaitingWordFlush", 180));
 prop.put("wordCacheMaxCount", switchboard.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000));
 prop.put("wordCacheInitCount", switchboard.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, 30000));

@@ -42,7 +42,7 @@ public class queues_p {
 prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.webIndex.queuePreStack.getActiveQueueSize());
 prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30));
 prop.putNum("urlpublictextSize", sb.webIndex.metadata().size());
-prop.putNum("rwipublictextSize", sb.webIndex.size());
+prop.putNum("rwipublictextSize", sb.webIndex.index().size());
 if ((sb.webIndex.queuePreStack.size() == 0) && (sb.webIndex.queuePreStack.getActiveQueueSize() == 0)) {
 prop.put("list", "0"); //is empty
 } else {

@@ -21,11 +21,11 @@ public class status_p {
 prop.setLocalized(false);
 prop.put("rejected", "0");
 sb.updateMySeed();
-final int cacheSize = sb.webIndex.indexCacheSize();
+final int cacheSize = sb.webIndex.index().indexCacheSize();
 final long cacheMaxSize = sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 10000);
 prop.putNum("ppm", sb.currentPPM());
 prop.putNum("qpm", sb.webIndex.peers().mySeed().getQPM());
-prop.putNum("wordCacheSize", sb.webIndex.indexCacheSize());
+prop.putNum("wordCacheSize", sb.webIndex.index().indexCacheSize());
 prop.putNum("wordCacheSize", cacheSize);
 prop.putNum("wordCacheMaxSize", cacheMaxSize);
 prop.put("wordCacheCount", cacheSize);

@@ -78,7 +78,7 @@ public final class timeline {
 yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");
 // get the index container with the result vector
-HashMap<String, ReferenceContainer>[] localSearchContainerMaps = sb.webIndex.localSearchContainers(query[0], query[1], null);
+HashMap<String, ReferenceContainer>[] localSearchContainerMaps = sb.webIndex.index().localSearchContainers(query[0], query[1], null);
 final ReferenceContainer index =
 ReferenceContainer.joinExcludeContainers(
 localSearchContainerMaps[0].values(),

@@ -82,13 +82,13 @@ public final class query {
 if (obj.equals("rwiurlcount")) {
 // the total number of different urls in the rwi is returned
 // <env> shall contain a word hash, the number of assigned lurls to this hash is returned
-prop.put("response", sb.webIndex.getReferences(env, null).size());
+prop.put("response", sb.webIndex.index().getReferences(env, null).size());
 return prop;
 }
 if (obj.equals("rwicount")) {
 // return the total number of available word indexes
-prop.put("response", sb.webIndex.size());
+prop.put("response", sb.webIndex.index().size());
 return prop;
 }

@@ -185,7 +185,7 @@ public final class search {
 yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
 final long timer = System.currentTimeMillis();
-final Map<String, ReferenceContainer>[] containers = sb.webIndex.localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2Set(urls));
+final Map<String, ReferenceContainer>[] containers = sb.webIndex.index().localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2Set(urls));
 serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.COLLECTION, containers[0].size(), System.currentTimeMillis() - timer), false);
 if (containers != null) {

@@ -100,9 +100,9 @@ public final class transferRWI {
 sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted.");
 result = "not_granted";
 pause = 0;
-} else if (sb.webIndex.indexCacheSize() > cachelimit) {
+} else if (sb.webIndex.index().indexCacheSize() > cachelimit) {
 // we are too busy to receive indexes
-sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.webIndex.indexCacheSize() + ").");
+sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.webIndex.index().indexCacheSize() + ").");
 granted = false; // don't accept more words if there are too many words to flush
 result = "busy";
 pause = 60000;
@@ -157,7 +157,7 @@ public final class transferRWI {
 }
 // learn entry
-sb.webIndex.addEntry(wordHash, iEntry, System.currentTimeMillis());
+sb.webIndex.index().addEntry(wordHash, iEntry, System.currentTimeMillis());
 serverCore.checkInterruption();
 // check if we need to ask for the corresponding URL
@@ -193,7 +193,7 @@ public final class transferRWI {
 }
 result = "ok";
-pause = (int) (sb.webIndex.indexCacheSize() * 20000 / sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
+pause = (int) (sb.webIndex.index().indexCacheSize() * 20000 / sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
 }
 prop.put("unknownURL", unknownURLs.toString());

@@ -315,7 +315,7 @@ public class yacysearch {
 // delete the index entry locally
 final String delHash = post.get("deleteref", ""); // urlhash
-sb.webIndex.removeWordReferences(query[0], delHash);
+sb.webIndex.index().removeWordReferences(query[0], delHash);
 // make new news message with negative voting
 final HashMap<String, String> map = new HashMap<String, String>();

@@ -81,7 +81,7 @@ public class Balancer {
 if (urlFileStack.size() != urlFileIndex.size() || (urlFileIndex.size() < 10000 && urlFileIndex.size() > 0)) {
 // fix the file stack
 Log.logInfo("Balancer", "re-creating the " + stackname + " balancer stack, size = " + urlFileIndex.size() + ((urlFileStack.size() == urlFileIndex.size()) ? "" : " (the old stack size was wrong)" ));
-urlFileStack = Stack.reset(urlFileStack);
+urlFileStack.clear();
 try {
 final Iterator<byte[]> i = urlFileIndex.keys(true, null);
 byte[] hash;
@@ -130,7 +130,7 @@ public class Balancer {
 }
 public synchronized void clear() {
-urlFileStack = Stack.reset(urlFileStack);
+urlFileStack.clear();
 domainStacks.clear();
 urlRAMStack.clear();
 resetFileIndex();
@@ -544,7 +544,7 @@ public class Balancer {
 if (nextentry == null) {
 // emergency case: this means that something with the stack organization is wrong
 // the file appears to be broken. We kill the file.
-Stack.reset(urlFileStack);
+urlFileStack.clear();
 Log.logSevere("BALANCER", "get() failed to fetch entry from file stack. reset stack file.");
 } else {
 final String nexthash = new String(nextentry.getColBytes(0));

@@ -51,19 +51,16 @@ import de.anomic.yacy.yacyURL;
 public class IndexingStack {
-    Stack sbQueueStack;
-    CrawlProfile profiles;
-    plasmaWordIndex wordIndex;
-    private final File sbQueueStackPath;
-    ConcurrentHashMap<String, QueueEntry> queueInProcess;
+    private final Stack sbQueueStack;
+    private final CrawlProfile profiles;
+    private final plasmaWordIndex wordIndex;
+    private final ConcurrentHashMap<String, QueueEntry> queueInProcess;

 public IndexingStack(final plasmaWordIndex wordIndex, final File sbQueueStackPath, final CrawlProfile profiles) {
-    this.sbQueueStackPath = sbQueueStackPath;
     this.profiles = profiles;
     this.wordIndex = wordIndex;
     this.queueInProcess = new ConcurrentHashMap<String, QueueEntry>();
-    initQueueStack();
+    this.sbQueueStack = Stack.open(sbQueueStackPath, rowdef);
 }
 public static final Row rowdef = new Row(
@@ -77,18 +74,7 @@ public class IndexingStack {
 "String urldescr-80",
 NaturalOrder.naturalOrder,
 0);
-private void initQueueStack() {
-    sbQueueStack = Stack.open(sbQueueStackPath, rowdef);
-}
-/*
-private void resetQueueStack() {
-    try {sbQueueStack.close();} catch (Exception e) {}
-    if (sbQueueStackPath.exists()) sbQueueStackPath.delete();
-    initQueueStack();
-}
-*/
 public int size() {
 return (sbQueueStack == null) ? 0 : sbQueueStack.size();
 }
@@ -131,14 +117,13 @@ public class IndexingStack {
 }
 public void clear() {
-sbQueueStack = Stack.reset(sbQueueStack);
+sbQueueStack.clear();
 }
 public void close() {
 if (sbQueueStack != null) {
 sbQueueStack.close();
 }
-sbQueueStack = null;
 }
 protected void finalize() throws Throwable {

@@ -78,17 +78,14 @@ public final class Stack extends FullRecords {
 }
 }
-public static Stack reset(final Stack stack) {
-    // memorize settings to this file
-    final File f = new File(stack.filename);
-    final Row row = stack.row();
-    // close and delete the file
-    try {stack.close();} catch (final Exception e) {}
-    if (f.exists()) f.delete();
-    // re-open a database with same settings as before
-    return open(f, row);
+public void clear() {
+    try {
+        super.clear();
+        setHandle(root, null);
+        setHandle(toor, null);
+    } catch (IOException e) {
+        e.printStackTrace();
+    }
 }
 public Iterator<Row.Entry> stackIterator(final boolean up) {

@@ -0,0 +1,449 @@
// plasmaWordIndex.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $
// $LastChangedRevision: 5709 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexCache;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProfiling;
public final class CachedIndexCollection implements Index {
// environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7
private static final ByteOrder indexOrder = Base64Order.enhancedCoder;
private final IndexCache indexCache;
private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster
public CachedIndexCollection(
File indexPrimaryTextLocation,
final int entityCacheMaxSize,
final boolean useCommons,
final int redundancy,
Log log) throws IOException {
final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
if (new File(textindexcache, "index.dhtin.blob").exists()) {
// migration of the both caches into one
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
IndexCache dhtInCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
for (ReferenceContainer c: dhtInCache) {
this.indexCache.addReferences(c);
}
new File(textindexcache, "index.dhtin.blob").delete();
} else {
// read in new BLOB
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
}
// create collections storage path
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new IndexCollection(
textindexcollections,
"collection",
12,
Base64Order.enhancedCoder,
maxCollectionPartition,
ReferenceRow.urlEntryRow,
useCommons);
}
public void clear() {
indexCache.clear();
try {
collections.clear();
} catch (IOException e) {
e.printStackTrace();
}
}
public int minMem() {
return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem();
}
public int maxURLinCache() {
return indexCache.maxURLinCache();
}
public long minAgeOfCache() {
return indexCache.minAgeOfCache();
}
public long maxAgeOfCache() {
return indexCache.maxAgeOfCache();
}
public int indexCacheSize() {
return indexCache.size();
}
public long indexCacheSizeBytes() {
// calculate the real size in bytes of the index cache
long cacheBytes = 0;
final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
final IndexCache cache = (indexCache);
synchronized (cache) {
final Iterator<ReferenceContainer> it = cache.referenceIterator(null, false, true);
while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
}
return cacheBytes;
}
public void setMaxWordCount(final int maxWords) {
indexCache.setMaxWordCount(maxWords);
}
public void cacheFlushControl(final IndexCache theCache) {
// check for forced flush
int cs = cacheSize();
if (cs > 0) {
// flush elements that are too big. This flushing depends on the fact that the flush rule
// selects the biggest elements first for flushing. If it does not for any reason, the following
// loop would not terminate.
serverProfiling.update("wordcache", Long.valueOf(cs), true);
// To ensure termination an additional counter is used
int l = 0;
while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
flushCacheOne(theCache);
}
// next flush more entries if the size exceeds the maximum size of the cache
while (theCache.size() > 0 &&
((theCache.size() > theCache.getMaxWordCount()) ||
(MemoryControl.available() < collections.minMem()))) {
flushCacheOne(theCache);
}
if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true);
}
}
public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
}
public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) {
// add the entry
indexCache.addEntry(wordHash, entry, updateTime, true);
cacheFlushControl(this.indexCache);
}
public void addReferences(final ReferenceContainer entries) {
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
// add the entry
indexCache.addReferences(entries);
cacheFlushControl(this.indexCache);
}
public void flushCacheFor(int time) {
flushCacheUntil(System.currentTimeMillis() + time);
}
private synchronized void flushCacheUntil(long timeout) {
while (System.currentTimeMillis() < timeout && indexCache.size() > 0) {
flushCacheOne(indexCache);
}
}
private synchronized void flushCacheOne(final IndexCache ram) {
if (ram.size() > 0) collections.addReferences(flushContainer(ram));
}
private ReferenceContainer flushContainer(final IndexCache ram) {
String wordHash;
ReferenceContainer c;
wordHash = ram.maxScoreWordHash();
c = ram.getReferences(wordHash, null);
if ((c != null) && (c.size() > wCacheMaxChunk)) {
return ram.deleteAllReferences(wordHash);
} else {
return ram.deleteAllReferences(ram.bestFlushWordHash());
}
}
public boolean hasReferences(final String wordHash) {
if (indexCache.hasReferences(wordHash)) return true;
if (collections.hasReferences(wordHash)) return true;
return false;
}
public ReferenceContainer getReferences(final String wordHash, final Set<String> urlselection) {
if (wordHash == null) {
// wrong input
return null;
}
// get from cache
ReferenceContainer container;
container = indexCache.getReferences(wordHash, urlselection);
// get from collection index
if (container == null) {
container = collections.getReferences(wordHash, urlselection);
} else {
container.addAllUnique(collections.getReferences(wordHash, urlselection));
}
if (container == null) return null;
// check doubles
final int beforeDouble = container.size();
container.sort();
final ArrayList<RowCollection> d = container.removeDoubles();
RowCollection set;
for (int i = 0; i < d.size(); i++) {
// for each element in the double-set, take that one that is the most recent one
set = d.get(i);
ReferenceRow e, elm = null;
long lm = 0;
for (int j = 0; j < set.size(); j++) {
e = new ReferenceRow(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) {
elm = e;
lm = e.lastModified();
}
}
if(elm != null) {
container.addUnique(elm.toKelondroEntry());
}
}
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());
return container;
}
/**
* return map of wordhash:indexContainer
*
* @param wordHashes
* @param urlselection
* @param deleteIfEmpty
* @param interruptIfEmpty
* @return
*/
public HashMap<String, ReferenceContainer> getContainers(final Set<String> wordHashes, final Set<String> urlselection, final boolean interruptIfEmpty) {
// retrieve entities that belong to the hashes
final HashMap<String, ReferenceContainer> containers = new HashMap<String, ReferenceContainer>(wordHashes.size());
String singleHash;
ReferenceContainer singleContainer;
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
// get next word hash:
singleHash = i.next();
// retrieve index
singleContainer = getReferences(singleHash, urlselection);
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap<String, ReferenceContainer>(0);
containers.put(singleHash, singleContainer);
}
return containers;
}
@SuppressWarnings("unchecked")
public HashMap<String, ReferenceContainer>[] localSearchContainers(
final TreeSet<String> queryHashes,
final TreeSet<String> excludeHashes,
final Set<String> urlselection) {
// search for the set of hashes and return a map of wordhash:indexContainer containing the search result
// retrieve entities that belong to the hashes
HashMap<String, ReferenceContainer> inclusionContainers = (queryHashes.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
queryHashes,
urlselection,
true);
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<String, ReferenceContainer>(0); // prevent that only a subset is returned
final HashMap<String, ReferenceContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
excludeHashes,
urlselection,
true);
return new HashMap[]{inclusionContainers, exclusionContainers};
}
public int size() {
return java.lang.Math.max(collections.size(), indexCache.size());
}
public int collectionsSize() {
return collections.size();
}
public int cacheSize() {
return indexCache.size();
}
public void close() {
indexCache.close();
collections.close();
}
public ReferenceContainer deleteAllReferences(final String wordHash) {
final ReferenceContainer c = new ReferenceContainer(
wordHash,
ReferenceRow.urlEntryRow,
indexCache.countReferences(wordHash));
c.addAllUnique(indexCache.deleteAllReferences(wordHash));
c.addAllUnique(collections.deleteAllReferences(wordHash));
return c;
}
public boolean removeReference(final String wordHash, final String urlHash) {
boolean removed = false;
removed = removed | (indexCache.removeReference(wordHash, urlHash));
removed = removed | (collections.removeReference(wordHash, urlHash));
return removed;
}
public int removeEntryMultiple(final Set<String> wordHashes, final String urlHash) {
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
final Iterator<String> i = wordHashes.iterator();
int count = 0;
while (i.hasNext()) {
if (removeReference(i.next(), urlHash)) count++;
}
return count;
}
public int removeReferences(final String wordHash, final Set<String> urlHashes) {
int removed = 0;
removed += indexCache.removeReferences(wordHash, urlHashes);
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
}
public String removeEntriesExpl(final String wordHash, final Set<String> urlHashes) {
String removed = "";
removed += indexCache.removeReferences(wordHash, urlHashes) + ", ";
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
}
public void removeEntriesMultiple(final Set<String> wordHashes, final Set<String> urlHashes) {
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
removeReferences(i.next(), urlHashes);
}
}
public int removeWordReferences(final Set<String> words, final String urlhash) {
// sequentially delete all word references
// returns number of deletions
final Iterator<String> iter = words.iterator();
int count = 0;
while (iter.hasNext()) {
// delete the URL reference in this word index
if (removeReference(Word.word2hash(iter.next()), urlhash)) count++;
}
return count;
}
public synchronized TreeSet<ReferenceContainer> indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) {
// creates a set of indexContainers
// this does not use the cache
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startHash, 0));
final TreeSet<ReferenceContainer> containers = new TreeSet<ReferenceContainer>(containerOrder);
final Iterator<ReferenceContainer> i = referenceIterator(startHash, rot, ram);
if (ram) count = Math.min(indexCache.size(), count);
ReferenceContainer container;
// this loop does not terminate using the i.hasNext() predicate when rot == true
// because then the underlying iterator is a rotating iterator without termination
// in this case a termination must be ensured with a counter
// It must also be ensured that the counter is in/decreased every loop
while ((count > 0) && (i.hasNext())) {
container = i.next();
if ((container != null) && (container.size() > 0)) {
containers.add(container);
}
count--; // decrease counter even if the container was null or empty to ensure termination
}
return containers; // this may return less containers as demanded
}
public synchronized CloneableIterator<ReferenceContainer> referenceIterator(final String startHash, final boolean rot, final boolean ram) {
final CloneableIterator<ReferenceContainer> i = wordContainers(startHash, ram);
if (rot) {
return new RotateIterator<ReferenceContainer>(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ? 0 : collections.size()));
}
return i;
}
private synchronized CloneableIterator<ReferenceContainer> wordContainers(final String startWordHash, final boolean ram) {
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startWordHash, 0));
if (ram) {
return indexCache.referenceIterator(startWordHash, false, true);
}
return collections.referenceIterator(startWordHash, false, false);
/*
return new MergeIterator<ReferenceContainer>(
indexCache.referenceIterator(startWordHash, false, true),
collections.referenceIterator(startWordHash, false, false),
containerOrder,
ReferenceContainer.containerMergeMethod,
true);
*/
}
public int countReferences(String key) {
return indexCache.countReferences(key) + collections.countReferences(key);
}
}

@@ -38,7 +38,6 @@ import de.anomic.kelondro.index.Row;
 import de.anomic.kelondro.index.RowSet;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.util.ByteBuffer;
-import de.anomic.plasma.plasmaWordIndex;
 public class ReferenceContainer extends RowSet {
@@ -229,11 +228,11 @@ public class ReferenceContainer extends RowSet {
 // join a search result and return the joincount (number of pages after join)
 // since this is a conjunction we return an empty entity if any word is not known
-if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
+if (includeContainers == null) return CachedIndexCollection.emptyContainer(null, 0);
 // join the result
 final ReferenceContainer rcLocal = ReferenceContainer.joinContainers(includeContainers, maxDistance);
-if (rcLocal == null) return plasmaWordIndex.emptyContainer(null, 0);
+if (rcLocal == null) return CachedIndexCollection.emptyContainer(null, 0);
 excludeContainers(rcLocal, excludeContainers);
 return rcLocal;

@@ -36,7 +36,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
 super("PLASMADB");
 this.homeWordIndex = homeWI;
 this.importWordIndex = importWI;
-this.importStartSize = this.importWordIndex.size();
+this.importStartSize = this.importWordIndex.index().size();
 }
 /**
/** /**
@@ -93,15 +93,15 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
 try {
 this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation(true).getAbsolutePath() + "'");
-this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
-this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
+this.log.logInfo("Home word index contains " + homeWordIndex.index().size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
+this.log.logInfo("Import word index contains " + this.importWordIndex.index().size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
 final HashSet<String> unknownUrlBuffer = new HashSet<String>();
 final HashSet<String> importedUrlBuffer = new HashSet<String>();
 // iterate over all words from import db
 //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
-Iterator<ReferenceContainer> indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator();
+Iterator<ReferenceContainer> indexContainerIterator = this.importWordIndex.index().indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator();
 while (!isAborted() && indexContainerIterator.hasNext()) {
 final TreeSet<String> entityUrls = new TreeSet<String>();
@@ -169,10 +169,10 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
 if (isAborted()) break;
 // importing entity container to home db
-if (newContainer.size() > 0) { homeWordIndex.addReferences(newContainer); }
+if (newContainer.size() > 0) { homeWordIndex.index().addReferences(newContainer); }
 // delete complete index entity file
-this.importWordIndex.deleteAllReferences(this.wordHash);
+this.importWordIndex.index().deleteAllReferences(this.wordHash);
 // print out some statistical information
 if (this.entryCounter % 500 == 0) {
@@ -189,8 +189,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
 "Speed: "+ 500*1000/duration + " word entities/s" +
 " | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
 " | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
-"Home Words = " + homeWordIndex.size() +
-" | Import Words = " + this.importWordIndex.size());
+"Home Words = " + homeWordIndex.index().size() +
+" | Import Words = " + this.importWordIndex.index().size());
 this.wordChunkStart = this.wordChunkEnd;
 this.wordChunkStartHash = this.wordChunkEndHash;
 }
@@ -203,7 +203,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
 if (!indexContainerIterator.hasNext()) {
 // We may not be finished yet, try to get the next chunk of wordHashes
-final TreeSet<ReferenceContainer> containers = this.importWordIndex.indexContainerSet(this.wordHash, false, false, 100);
+final TreeSet<ReferenceContainer> containers = this.importWordIndex.index().indexContainerSet(this.wordHash, false, false, 100);
 indexContainerIterator = containers.iterator();
 // Make sure we don't get the same wordhash twice, but don't skip a word
 if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getWordHash()))) {
@@ -212,8 +212,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
 }
 }
-this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
-this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
+this.log.logInfo("Home word index contains " + homeWordIndex.index().size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
+this.log.logInfo("Import word index contains " + this.importWordIndex.index().size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
 } catch (final Exception e) {
 this.log.logSevere("Database import failed.",e);
 e.printStackTrace();

@@ -248,7 +248,7 @@ public final class plasmaSearchEvent {
 if (rw > 0) {
 final Set<String> removeWords = cleanEvent.query.queryHashes;
 removeWords.addAll(cleanEvent.query.excludeHashes);
-cleanEvent.wordIndex.removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet());
+cleanEvent.wordIndex.index().removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet());
 Log.logInfo("SearchEvents", "cleaning up event " + cleanEvent.query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words");
 }
@@ -301,7 +301,7 @@ public final class plasmaSearchEvent {
 (query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
 (!(metadata.dc_title().startsWith("Index of")))) {
 final Iterator<String> wi = query.queryHashes.iterator();
-while (wi.hasNext()) wordIndex.removeReference(wi.next(), page.hash());
+while (wi.hasNext()) wordIndex.index().removeReference(wi.next(), page.hash());
 registerFailure(page.hash(), "index-of constraint not fullfilled");
 return null;
 }
@@ -824,7 +824,7 @@ public final class plasmaSearchEvent {
 String address = null;
 if ((seed == null) || ((address = seed.getPublicAddress()) == null)) {
 // seed is not known from here
-wordIndex.removeWordReferences(
+wordIndex.index().removeWordReferences(
 plasmaCondenser.getWords(
 ("yacyshare " +
 filename.replace('?', ' ') +

@@ -110,7 +110,7 @@ public final class plasmaSearchRankingProcess {
 public void execQuery() {
 long timer = System.currentTimeMillis();
-this.localSearchContainerMaps = wordIndex.localSearchContainers(query.queryHashes, query.excludeHashes, null);
+this.localSearchContainerMaps = wordIndex.index().localSearchContainers(query.queryHashes, query.excludeHashes, null);
 serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size(), System.currentTimeMillis() - timer), false);
 // join and exclude the local result

@@ -952,12 +952,12 @@ public class plasmaSnippetCache {
 assert plasmaSwitchboard.getSwitchboard().webIndex != null;
 assert event != null : "eventID = " + eventID;
 assert event.getQuery() != null;
-plasmaSwitchboard.getSwitchboard().webIndex.removeEntryMultiple(event.getQuery().queryHashes, urlHash);
+plasmaSwitchboard.getSwitchboard().webIndex.index().removeEntryMultiple(event.getQuery().queryHashes, urlHash);
 event.remove(urlHash);
 }
 if (snippet.getErrorCode() == ERROR_NO_MATCH) {
 log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-plasmaSwitchboard.getSwitchboard().webIndex.removeEntryMultiple(snippet.remaingHashes, urlHash);
+plasmaSwitchboard.getSwitchboard().webIndex.index().removeEntryMultiple(snippet.remaingHashes, urlHash);
 plasmaSearchEvent.getEvent(eventID).remove(urlHash);
 }
 return snippet.getError();

@@ -329,7 +329,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
 // init a DHT transmission dispatcher
 this.dhtDispatcher = new Dispatcher(
-webIndex,
+webIndex.index(),
 webIndex.metadata(),
 webIndex.peers(),
 true,
@@ -1119,12 +1119,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
 }
 public int rwiCacheSize() {
-return webIndex.cacheSize();
+return webIndex.index().cacheSize();
 }
 public boolean rwiCacheFlush() {
 if (rwiCacheSize() == 0) return false;
-webIndex.flushCacheFor((int) ((this.getConfigLong(plasmaSwitchboardConstants.CACHEFLUSH_BUSYSLEEP, 10000) * this.getConfigLong("performanceIO", 10)) / 100));
+webIndex.index().flushCacheFor((int) ((this.getConfigLong(plasmaSwitchboardConstants.CACHEFLUSH_BUSYSLEEP, 10000) * this.getConfigLong("performanceIO", 10)) / 100));
 return true;
 }
@@ -1143,7 +1143,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
 public void deQueueFreeMem() {
 // flush some entries from the RAM cache
-webIndex.flushCacheFor(5000);
+webIndex.index().flushCacheFor(5000);
 // empty some caches
 webIndex.metadata().clearCache();
 plasmaSearchEvent.cleanupEvents(true);
@@ -1772,7 +1772,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
 // delete all word references
 int count = 0;
-if (words != null) count = webIndex.removeWordReferences(words, urlhash);
+if (words != null) count = webIndex.index().removeWordReferences(words, urlhash);
 // finally delete the url entry itself
 webIndex.metadata().remove(urlhash);
@@ -1889,8 +1889,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
 if (webIndex.metadata().size() < 10) {
 return "no DHT distribution: loadedURL.size() = " + webIndex.metadata().size();
 }
-if (webIndex.size() < 100) {
-return "no DHT distribution: not enough words - wordIndex.size() = " + webIndex.size();
+if (webIndex.index().size() < 100) {
+return "no DHT distribution: not enough words - wordIndex.size() = " + webIndex.index().size();
 }
 if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false").equalsIgnoreCase("false")) && (crawlQueues.noticeURL.notEmpty())) {
 return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + webIndex.queuePreStack.size();
@@ -1992,7 +1992,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
 webIndex.peers().mySeed().put(yacySeed.LCOUNT, Integer.toString(webIndex.metadata().size())); // the number of links that the peer has stored (LURL's)
 webIndex.peers().mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
 webIndex.peers().mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
-webIndex.peers().mySeed().put(yacySeed.ICOUNT, Integer.toString(webIndex.size())); // the minimum number of words that the peer has indexed (as it says)
+webIndex.peers().mySeed().put(yacySeed.ICOUNT, Integer.toString(webIndex.index().size())); // the minimum number of words that the peer has indexed (as it says)
 webIndex.peers().mySeed().put(yacySeed.SCOUNT, Integer.toString(webIndex.peers().sizeConnected())); // the number of seeds that the peer has stored
 webIndex.peers().mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((webIndex.peers().sizeConnected() + webIndex.peers().sizeDisconnected() + webIndex.peers().sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)
 webIndex.peers().mySeed().put(yacySeed.VERSION, getConfig("version", ""));
@ -28,55 +28,39 @@ package de.anomic.plasma;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.IndexingStack; import de.anomic.crawler.IndexingStack;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpdProxyCacheEntry; import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.index.RowCollection; import de.anomic.kelondro.text.CachedIndexCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexCache;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.MetadataRowContainer; import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow; import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.MetadataRepository; import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.Word; import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.Blacklist; import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.kelondroException; import de.anomic.kelondro.util.kelondroException;
import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProfiling;
import de.anomic.tools.iso639; import de.anomic.tools.iso639;
import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage; import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
public final class plasmaWordIndex implements Index { public final class plasmaWordIndex {
// environment constants // environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900; public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7 public static final int maxCollectionPartition = 7; // should be 7
private static final ByteOrder indexOrder = Base64Order.enhancedCoder;
public static final String CRAWL_PROFILE_PROXY = "proxy"; public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote"; public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
@ -93,8 +77,7 @@ public final class plasmaWordIndex implements Index {
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
private final IndexCache indexCache; private final CachedIndexCollection index;
private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster
private final Log log; private final Log log;
private MetadataRepository metadata; private MetadataRepository metadata;
private final yacySeedDB peers; private final yacySeedDB peers;
@ -139,34 +122,13 @@ public final class plasmaWordIndex implements Index {
} }
} }
} }
this.index = new CachedIndexCollection(
final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE"); indexPrimaryTextLocation,
if (!(textindexcache.exists())) textindexcache.mkdirs(); entityCacheMaxSize,
if (new File(textindexcache, "index.dhtin.blob").exists()) { useCommons,
// migration of the both caches into one redundancy,
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); log);
IndexCache dhtInCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
for (ReferenceContainer c: dhtInCache) {
this.indexCache.addReferences(c);
}
new File(textindexcache, "index.dhtin.blob").delete();
} else {
// read in new BLOB
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
}
// create collections storage path
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new IndexCollection(
textindexcollections,
"collection",
12,
Base64Order.enhancedCoder,
maxCollectionPartition,
ReferenceRow.urlEntryRow,
useCommons);
// create LURL-db // create LURL-db
metadata = new MetadataRepository(new File(this.secondaryRoot, "TEXT")); metadata = new MetadataRepository(new File(this.secondaryRoot, "TEXT"));
@ -249,13 +211,12 @@ public final class plasmaWordIndex implements Index {
return this.peers; return this.peers;
} }
public CachedIndexCollection index() {
return this.index;
}
public void clear() { public void clear() {
indexCache.clear(); index.clear();
try {
collections.clear();
} catch (IOException e) {
e.printStackTrace();
}
try { try {
metadata.clear(); metadata.clear();
} catch (final IOException e) { } catch (final IOException e) {
@ -377,111 +338,7 @@ public final class plasmaWordIndex implements Index {
public File getLocation(final boolean primary) { public File getLocation(final boolean primary) {
return (primary) ? this.primaryRoot : this.secondaryRoot; return (primary) ? this.primaryRoot : this.secondaryRoot;
} }
public int minMem() {
return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem();
}
public int maxURLinCache() {
return indexCache.maxURLinCache();
}
public long minAgeOfCache() {
return indexCache.minAgeOfCache();
}
public long maxAgeOfCache() {
return indexCache.maxAgeOfCache();
}
public int indexCacheSize() {
return indexCache.size();
}
public long indexCacheSizeBytes() {
// calculate the real size in bytes of the index cache
long cacheBytes = 0;
final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
final IndexCache cache = (indexCache);
synchronized (cache) {
final Iterator<ReferenceContainer> it = cache.referenceIterator(null, false, true);
while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
}
return cacheBytes;
}
public void setMaxWordCount(final int maxWords) {
indexCache.setMaxWordCount(maxWords);
}
public void cacheFlushControl(final IndexCache theCache) {
// check for forced flush
int cs = cacheSize();
if (cs > 0) {
// flush elements that are too big. This flushing depends on the fact that the flush rule
// selects the biggest elements first for flushing. If it does not for any reason, the following
// loop would not terminate.
serverProfiling.update("wordcache", Long.valueOf(cs), true);
// To ensure termination an additional counter is used
int l = 0;
while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
flushCacheOne(theCache);
}
// next flush more entries if the size exceeds the maximum size of the cache
while (theCache.size() > 0 &&
((theCache.size() > theCache.getMaxWordCount()) ||
(MemoryControl.available() < collections.minMem()))) {
flushCacheOne(theCache);
}
if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true);
}
}
public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
}
public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) {
// add the entry
indexCache.addEntry(wordHash, entry, updateTime, true);
cacheFlushControl(this.indexCache);
}
public void addReferences(final ReferenceContainer entries) {
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
// add the entry
indexCache.addReferences(entries);
cacheFlushControl(this.indexCache);
}
public void flushCacheFor(int time) {
flushCacheUntil(System.currentTimeMillis() + time);
}
private synchronized void flushCacheUntil(long timeout) {
while (System.currentTimeMillis() < timeout && indexCache.size() > 0) {
flushCacheOne(indexCache);
}
}
private synchronized void flushCacheOne(final IndexCache ram) {
if (ram.size() > 0) collections.addReferences(flushContainer(ram));
}
private ReferenceContainer flushContainer(final IndexCache ram) {
String wordHash;
ReferenceContainer c;
wordHash = ram.maxScoreWordHash();
c = ram.getReferences(wordHash, null);
if ((c != null) && (c.size() > wCacheMaxChunk)) {
return ram.deleteAllReferences(wordHash);
} else {
return ram.deleteAllReferences(ram.bestFlushWordHash());
}
}
/** /**
     * this is called by the switchboard to put a new page into the index      * this is called by the switchboard to put a new page into the index
     * use all the words in one condenser object to simultaneously create index entries      * use all the words in one condenser object to simultaneously create index entries
@ -526,221 +383,20 @@ public final class plasmaWordIndex implements Index {
doctype, doctype,
outlinksSame, outlinksOther, outlinksSame, outlinksOther,
wprop.flags); wprop.flags);
addEntry(Word.word2hash(word), ientry, System.currentTimeMillis()); this.index.addEntry(Word.word2hash(word), ientry, System.currentTimeMillis());
wordCount++; wordCount++;
} }
return wordCount; return wordCount;
} }
public boolean hasReferences(final String wordHash) {
if (indexCache.hasReferences(wordHash)) return true;
if (collections.hasReferences(wordHash)) return true;
return false;
}
public ReferenceContainer getReferences(final String wordHash, final Set<String> urlselection) {
if ((wordHash == null) || (wordHash.length() != yacySeedDB.commonHashLength)) {
// wrong input
return null;
}
// get from cache
ReferenceContainer container;
container = indexCache.getReferences(wordHash, urlselection);
// get from collection index
if (container == null) {
container = collections.getReferences(wordHash, urlselection);
} else {
container.addAllUnique(collections.getReferences(wordHash, urlselection));
}
if (container == null) return null;
// check doubles
final int beforeDouble = container.size();
container.sort();
final ArrayList<RowCollection> d = container.removeDoubles();
RowCollection set;
for (int i = 0; i < d.size(); i++) {
            // for each element in the double-set, take the one that is the most recent
set = d.get(i);
ReferenceRow e, elm = null;
long lm = 0;
for (int j = 0; j < set.size(); j++) {
e = new ReferenceRow(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) {
elm = e;
lm = e.lastModified();
}
}
if(elm != null) {
container.addUnique(elm.toKelondroEntry());
}
}
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());
return container;
}
/**
* return map of wordhash:indexContainer
*
* @param wordHashes
* @param urlselection
* @param deleteIfEmpty
* @param interruptIfEmpty
* @return
*/
public HashMap<String, ReferenceContainer> getContainers(final Set<String> wordHashes, final Set<String> urlselection, final boolean interruptIfEmpty) {
// retrieve entities that belong to the hashes
final HashMap<String, ReferenceContainer> containers = new HashMap<String, ReferenceContainer>(wordHashes.size());
String singleHash;
ReferenceContainer singleContainer;
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
// get next word hash:
singleHash = i.next();
// retrieve index
singleContainer = getReferences(singleHash, urlselection);
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap<String, ReferenceContainer>(0);
containers.put(singleHash, singleContainer);
}
return containers;
}
@SuppressWarnings("unchecked")
public HashMap<String, ReferenceContainer>[] localSearchContainers(
final TreeSet<String> queryHashes,
final TreeSet<String> excludeHashes,
final Set<String> urlselection) {
        // search for the set of hashes and return a map of wordhash:indexContainer containing the search result
// retrieve entities that belong to the hashes
HashMap<String, ReferenceContainer> inclusionContainers = (queryHashes.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
queryHashes,
urlselection,
true);
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<String, ReferenceContainer>(0); // prevent that only a subset is returned
final HashMap<String, ReferenceContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
excludeHashes,
urlselection,
true);
return new HashMap[]{inclusionContainers, exclusionContainers};
}
public int size() {
return java.lang.Math.max(collections.size(), indexCache.size());
}
public int collectionsSize() {
return collections.size();
}
public int cacheSize() {
return indexCache.size();
}
public void close() { public void close() {
indexCache.close(); index.close();
collections.close();
metadata.close(); metadata.close();
peers.close(); peers.close();
profilesActiveCrawls.close(); profilesActiveCrawls.close();
queuePreStack.close(); queuePreStack.close();
} }
public ReferenceContainer deleteAllReferences(final String wordHash) {
final ReferenceContainer c = new ReferenceContainer(
wordHash,
ReferenceRow.urlEntryRow,
indexCache.countReferences(wordHash));
c.addAllUnique(indexCache.deleteAllReferences(wordHash));
c.addAllUnique(collections.deleteAllReferences(wordHash));
return c;
}
public boolean removeReference(final String wordHash, final String urlHash) {
boolean removed = false;
removed = removed | (indexCache.removeReference(wordHash, urlHash));
removed = removed | (collections.removeReference(wordHash, urlHash));
return removed;
}
public int removeEntryMultiple(final Set<String> wordHashes, final String urlHash) {
// remove the same url hashes for multiple words
        // this is mainly used when correcting an index after a search
final Iterator<String> i = wordHashes.iterator();
int count = 0;
while (i.hasNext()) {
if (removeReference(i.next(), urlHash)) count++;
}
return count;
}
public int removeReferences(final String wordHash, final Set<String> urlHashes) {
int removed = 0;
removed += indexCache.removeReferences(wordHash, urlHashes);
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
}
public String removeEntriesExpl(final String wordHash, final Set<String> urlHashes) {
String removed = "";
removed += indexCache.removeReferences(wordHash, urlHashes) + ", ";
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
}
public void removeEntriesMultiple(final Set<String> wordHashes, final Set<String> urlHashes) {
// remove the same url hashes for multiple words
        // this is mainly used when correcting an index after a search
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
removeReferences(i.next(), urlHashes);
}
}
public int removeWordReferences(final Set<String> words, final String urlhash) {
// sequentially delete all word references
// returns number of deletions
final Iterator<String> iter = words.iterator();
int count = 0;
while (iter.hasNext()) {
// delete the URL reference in this word index
if (removeReference(Word.word2hash(iter.next()), urlhash)) count++;
}
return count;
}
public synchronized TreeSet<ReferenceContainer> indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) {
// creates a set of indexContainers
// this does not use the cache
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startHash, 0));
final TreeSet<ReferenceContainer> containers = new TreeSet<ReferenceContainer>(containerOrder);
final Iterator<ReferenceContainer> i = referenceIterator(startHash, rot, ram);
if (ram) count = Math.min(indexCache.size(), count);
ReferenceContainer container;
        // this loop does not terminate using the i.hasNext() predicate when rot == true
// because then the underlying iterator is a rotating iterator without termination
// in this case a termination must be ensured with a counter
// It must also be ensured that the counter is in/decreased every loop
while ((count > 0) && (i.hasNext())) {
container = i.next();
if ((container != null) && (container.size() > 0)) {
containers.add(container);
}
count--; // decrease counter even if the container was null or empty to ensure termination
}
        return containers; // this may return fewer containers than demanded
}
public MetadataRowContainer storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException { public MetadataRowContainer storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException {
final long startTime = System.currentTimeMillis(); final long startTime = System.currentTimeMillis();
@ -856,32 +512,6 @@ public final class plasmaWordIndex implements Index {
return newEntry; return newEntry;
} }
public synchronized CloneableIterator<ReferenceContainer> referenceIterator(final String startHash, final boolean rot, final boolean ram) {
final CloneableIterator<ReferenceContainer> i = wordContainers(startHash, ram);
if (rot) {
return new RotateIterator<ReferenceContainer>(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ? 0 : collections.size()));
}
return i;
}
private synchronized CloneableIterator<ReferenceContainer> wordContainers(final String startWordHash, final boolean ram) {
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startWordHash, 0));
if (ram) {
return indexCache.referenceIterator(startWordHash, false, true);
}
return collections.referenceIterator(startWordHash, false, false);
/*
return new MergeIterator<ReferenceContainer>(
indexCache.referenceIterator(startWordHash, false, true),
collections.referenceIterator(startWordHash, false, false),
containerOrder,
ReferenceContainer.containerMergeMethod,
true);
*/
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox // The Cleaner class was provided as "UrldbCleaner" by Hydrox
public synchronized ReferenceCleaner getReferenceCleaner(final String startHash) { public synchronized ReferenceCleaner getReferenceCleaner(final String startHash) {
return new ReferenceCleaner(startHash); return new ReferenceCleaner(startHash);
@ -899,7 +529,7 @@ public final class plasmaWordIndex implements Index {
public ReferenceCleaner(final String startHash) { public ReferenceCleaner(final String startHash) {
this.startHash = startHash; this.startHash = startHash;
this.rwiCountAtStart = size(); this.rwiCountAtStart = index().size();
} }
public void run() { public void run() {
@ -908,7 +538,7 @@ public final class plasmaWordIndex implements Index {
ReferenceRow entry = null; ReferenceRow entry = null;
yacyURL url = null; yacyURL url = null;
final HashSet<String> urlHashs = new HashSet<String>(); final HashSet<String> urlHashs = new HashSet<String>();
Iterator<ReferenceContainer> indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator(); Iterator<ReferenceContainer> indexContainerIterator = index.indexContainerSet(startHash, false, false, 100).iterator();
while (indexContainerIterator.hasNext() && run) { while (indexContainerIterator.hasNext() && run) {
waiter(); waiter();
container = indexContainerIterator.next(); container = indexContainerIterator.next();
@ -930,7 +560,7 @@ public final class plasmaWordIndex implements Index {
} }
} }
if (urlHashs.size() > 0) { if (urlHashs.size() > 0) {
final int removed = removeReferences(container.getWordHash(), urlHashs); final int removed = index.removeReferences(container.getWordHash(), urlHashs);
Log.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted"); Log.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
lastWordHash = container.getWordHash(); lastWordHash = container.getWordHash();
lastDeletionCounter = urlHashs.size(); lastDeletionCounter = urlHashs.size();
@ -938,7 +568,7 @@ public final class plasmaWordIndex implements Index {
} }
if (!containerIterator.hasNext()) { if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes // We may not be finished yet, try to get the next chunk of wordHashes
final TreeSet<ReferenceContainer> containers = indexContainerSet(container.getWordHash(), false, false, 100); final TreeSet<ReferenceContainer> containers = index.indexContainerSet(container.getWordHash(), false, false, 100);
indexContainerIterator = containers.iterator(); indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word // Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) { if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) {
@ -988,9 +618,4 @@ public final class plasmaWordIndex implements Index {
} }
} }
} }
public int countReferences(String key) {
return indexCache.countReferences(key) + collections.countReferences(key);
}
} }
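Taken together, the removals above reduce plasmaWordIndex to a facade: the former IndexCache/IndexCollection pair and all flush, lookup, and removal logic move into CachedIndexCollection, and the remaining class only constructs that component and hands it out. A condensed sketch of the resulting shape, based solely on the constructor call and accessor visible in this diff (the field name and the simplified constructor signature here are illustrative, not the class's real ones):

    public final class plasmaWordIndex {

        // single index component replacing the former indexCache + collections pair
        private final CachedIndexCollection index;

        plasmaWordIndex(final File indexPrimaryTextLocation, final int entityCacheMaxSize,
                        final boolean useCommons, final int redundancy, final Log log) {
            this.index = new CachedIndexCollection(indexPrimaryTextLocation,
                    entityCacheMaxSize, useCommons, redundancy, log);
        }

        // callers reach the RWI index only through this accessor
        public CachedIndexCollection index() { return this.index; }

        public void clear() { index.clear(); }
        public void close() { index.close(); }
    }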
@ -69,6 +69,7 @@ import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.Digest; import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.CachedIndexCollection;
import de.anomic.kelondro.text.MetadataRowContainer; import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.ReferenceContainer;
@ -529,7 +530,7 @@ public final class yacyClient {
final int words = wordhashes.length() / yacySeedDB.commonHashLength; final int words = wordhashes.length() / yacySeedDB.commonHashLength;
final ReferenceContainer[] container = new ReferenceContainer[words]; final ReferenceContainer[] container = new ReferenceContainer[words];
for (int i = 0; i < words; i++) { for (int i = 0; i < words; i++) {
container[i] = plasmaWordIndex.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), count); container[i] = CachedIndexCollection.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), count);
} }
// insert results to containers // insert results to containers
@ -638,7 +639,7 @@ public final class yacyClient {
// insert the containers to the index // insert the containers to the index
for (int m = 0; m < words; m++) { for (int m = 0; m < words; m++) {
wordIndex.addReferences(container[m]); wordIndex.index().addReferences(container[m]);
} }
// generate statistics // generate statistics
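In yacyClient the same accessor is used for write access: empty containers are obtained from the static helper that moved to CachedIndexCollection, and the containers filled with remote results are added back through wordIndex.index(). A compressed sketch of that flow, with the surrounding network code omitted (identifiers as in the diff):

    final ReferenceContainer[] container = new ReferenceContainer[words];
    for (int i = 0; i < words; i++) {
        container[i] = CachedIndexCollection.emptyContainer(
                wordhashes.substring(i * yacySeedDB.commonHashLength,
                                     (i + 1) * yacySeedDB.commonHashLength), count);
    }
    // ... containers are filled from the remote peer's search result ...
    for (int m = 0; m < words; m++) {
        wordIndex.index().addReferences(container[m]);
    }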
@ -676,7 +676,7 @@ public final class yacy {
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0); final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
final Iterator<ReferenceContainer> indexContainerIterator = wordIndex.referenceIterator("AAAAAAAAAAAA", false, false); final Iterator<ReferenceContainer> indexContainerIterator = wordIndex.index().referenceIterator("AAAAAAAAAAAA", false, false);
long urlCounter = 0, wordCounter = 0; long urlCounter = 0, wordCounter = 0;
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0; long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
@ -867,7 +867,7 @@ public final class yacy {
Iterator<ReferenceContainer> indexContainerIterator = null; Iterator<ReferenceContainer> indexContainerIterator = null;
if (resource.equals("all")) { if (resource.equals("all")) {
WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0); WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
indexContainerIterator = WordIndex.referenceIterator(wordChunkStartHash, false, false); indexContainerIterator = WordIndex.index().referenceIterator(wordChunkStartHash, false, false);
} }
int counter = 0; int counter = 0;
ReferenceContainer container = null; ReferenceContainer container = null;