From e3d75f42bd1a49e656d6cd59597e6aa81cfb3e1d Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 19 Nov 2006 20:05:25 +0000 Subject: [PATCH] final version of collection entry type definition - the test phase of the new collection data structure is finished - test data that had been generated is void. There will be no migration - the new collection files are located in DATA/INDEX/PUBLIC/TEXT/RICOLLECTION - the index dump is void. There will be no migration - the new index dump is in DATA/INDEX/PUBLIC/TEXT/RICACHE git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2983 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 5 +- source/de/anomic/http/httpc.java | 1 - source/de/anomic/index/indexCachedRI.java | 11 +- source/de/anomic/index/indexCollectionRI.java | 8 +- source/de/anomic/index/indexContainer.java | 41 ++- source/de/anomic/index/indexRAMRI.java | 35 ++- source/de/anomic/index/indexRWIEntryNew.java | 6 +- source/de/anomic/index/indexURLEntryOld.java | 8 +- .../anomic/kelondro/kelondroBase64Order.java | 4 +- .../kelondro/kelondroRowCollection.java | 8 +- source/de/anomic/kelondro/kelondroRowSet.java | 15 +- .../de/anomic/plasma/plasmaSearchEvent.java | 8 +- .../de/anomic/plasma/plasmaSwitchboard.java | 49 ++-- source/de/anomic/plasma/plasmaWordIndex.java | 271 ++++++++++-------- .../plasma/plasmaWordIndexAssortment.java | 2 +- .../plasmaWordIndexAssortmentCluster.java | 14 +- .../plasma/plasmaWordIndexFileCluster.java | 8 +- source/de/anomic/yacy/yacyClient.java | 7 +- source/de/anomic/yacy/yacySearch.java | 18 +- 19 files changed, 290 insertions(+), 229 deletions(-) diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index f0792243a..6fd6491f3 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -304,7 +304,8 @@ public class IndexControl_p { } i++; } - prop.put("keyhashsimilar_rows", rows); + prop.put("keyhashsimilar_rows_"+rows+"_cols", cols); + prop.put("keyhashsimilar_rows", rows + 1); prop.put("result", ""); } catch (IOException e) { prop.put("result", "unknown keys: " + e.getMessage()); @@ -439,7 +440,7 @@ public class IndexControl_p { prop.put("genUrlList_keyHash", keyhash); - if (index.size() == 0) { + if ((index == null) || (index.size() == 0)) { prop.put("genUrlList", 1); } else { final Iterator en = index.entries(); diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java index f8a383fd6..cc474ec14 100644 --- a/source/de/anomic/http/httpc.java +++ b/source/de/anomic/http/httpc.java @@ -83,7 +83,6 @@ import org.apache.commons.pool.impl.GenericObjectPool; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.net.URL; -import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverByteBuffer; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; diff --git a/source/de/anomic/index/indexCachedRI.java b/source/de/anomic/index/indexCachedRI.java index 05258d06e..6acb9b148 100644 --- a/source/de/anomic/index/indexCachedRI.java +++ b/source/de/anomic/index/indexCachedRI.java @@ -86,10 +86,6 @@ public class indexCachedRI implements indexRI { return entries.updated(); } - public indexContainer emptyContainer(String wordHash) { - return new indexContainer(wordHash, payloadrow); - } - public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) { // add the entry if (intern) { @@ -219,10 +215,9 @@ public class indexCachedRI implements indexRI { } public indexContainer deleteContainer(String wordHash) { - indexContainer c = new indexContainer(wordHash, payloadrow); - c.add(riIntern.deleteContainer(wordHash), -1); - c.add(riExtern.deleteContainer(wordHash), -1); - c.add(backend.deleteContainer(wordHash), -1); + indexContainer c = riIntern.deleteContainer(wordHash); + if (c == null) c = riExtern.deleteContainer(wordHash); else c.add(riExtern.deleteContainer(wordHash), -1); + if (c == null) c = backend.deleteContainer(wordHash); else c.add(backend.deleteContainer(wordHash), -1); return c; } diff --git a/source/de/anomic/index/indexCollectionRI.java b/source/de/anomic/index/indexCollectionRI.java index 6db01166f..ca0bdd3d7 100644 --- a/source/de/anomic/index/indexCollectionRI.java +++ b/source/de/anomic/index/indexCollectionRI.java @@ -104,7 +104,7 @@ public class indexCollectionRI implements indexRI { byte[] key = (byte[]) oo[0]; kelondroRowSet collection = (kelondroRowSet) oo[1]; if (collection == null) return null; - return new indexContainer(new String(key), collection); + return new indexContainer(new String(key), collection, true); } public void remove() { @@ -118,7 +118,7 @@ public class indexCollectionRI implements indexRI { kelondroRowSet collection = collectionIndex.get(wordHash.getBytes(), deleteIfEmpty); if (collection != null) collection.select(urlselection); if ((collection == null) || (collection.size() == 0)) return null; - return new indexContainer(wordHash, collection); + return new indexContainer(wordHash, collection, true); } catch (IOException e) { return null; } @@ -128,7 +128,7 @@ public class indexCollectionRI implements indexRI { try { kelondroRowSet collection = collectionIndex.delete(wordHash.getBytes()); if (collection == null) return null; - return new indexContainer(wordHash, collection); + return new indexContainer(wordHash, collection, true); } catch (IOException e) { return null; } @@ -153,7 +153,7 @@ public class indexCollectionRI implements indexRI { } public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { - indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow()); + indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow(), true); container.add(newEntry); return addEntries(container, updateTime, dhtCase); } diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index 1571fdfb5..5f866048e 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -41,30 +41,34 @@ import de.anomic.kelondro.kelondroRowSet; public class indexContainer extends kelondroRowSet { private String wordHash; + private boolean newRWI; - public indexContainer(String wordHash, kelondroRow rowdef, int objectCount, byte[] cache) { + public indexContainer(String wordHash, kelondroRow rowdef, int objectCount, byte[] cache, boolean newRWI) { super(rowdef, objectCount, cache, kelondroBase64Order.enhancedCoder, 0, 0); this.wordHash = wordHash; + this.newRWI = newRWI; } - public indexContainer(String wordHash, kelondroRow rowdef) { - this(wordHash, rowdef, kelondroBase64Order.enhancedCoder, 0); + public indexContainer(String wordHash, kelondroRow rowdef, boolean newRWI) { + this(wordHash, rowdef, kelondroBase64Order.enhancedCoder, 0, newRWI); } - public indexContainer(String wordHash, kelondroRowSet collection) { + public indexContainer(String wordHash, kelondroRowSet collection, boolean newRWI) { super(collection); this.wordHash = wordHash; + this.newRWI = newRWI; } - public indexContainer(String wordHash, kelondroRow rowdef, kelondroOrder ordering, int column) { + public indexContainer(String wordHash, kelondroRow rowdef, kelondroOrder ordering, int column, boolean newRWI) { super(rowdef); this.wordHash = wordHash; this.lastTimeWrote = 0; this.setOrdering(ordering, column); + this.newRWI = newRWI; } public indexContainer topLevelClone() { - indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.sortOrder, this.sortColumn); + indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.sortOrder, this.sortColumn, this.newRWI); newContainer.add(this, -1); return newContainer; } @@ -123,7 +127,11 @@ public class indexContainer extends kelondroRowSet { if (oldEntryRow == null) { return true; } else { - indexRWIEntry oldEntry = new indexRWIEntryOld(oldEntryRow); // FIXME: see if cloning is necessary + indexRWIEntry oldEntry; + if (entry instanceof indexRWIEntryNew) + oldEntry = new indexRWIEntryNew(oldEntryRow); + else + oldEntry = new indexRWIEntryOld(oldEntryRow); // FIXME: see if cloning is necessary if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container this.put(oldEntry.toKelondroEntry()); // put it back return false; @@ -136,13 +144,19 @@ public class indexContainer extends kelondroRowSet { public indexRWIEntry get(String urlHash) { kelondroRow.Entry entry = this.get(urlHash.getBytes()); if (entry == null) return null; - return new indexRWIEntryOld(entry); + if (this.newRWI) + return new indexRWIEntryNew(entry); + else + return new indexRWIEntryOld(entry); } public indexRWIEntry remove(String urlHash) { kelondroRow.Entry entry = this.remove(urlHash.getBytes()); if (entry == null) return null; - return new indexRWIEntryOld(entry); + if (this.newRWI) + return new indexRWIEntryNew(entry); + else + return new indexRWIEntryOld(entry); } public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { @@ -178,7 +192,10 @@ public class indexContainer extends kelondroRowSet { public Object next() { kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next(); if (rentry == null) return null; - return new indexRWIEntryOld(rentry); + if (newRWI) + return new indexRWIEntryNew(rentry); + else + return new indexRWIEntryOld(rentry); } public void remove() { @@ -288,7 +305,7 @@ public class indexContainer extends kelondroRowSet { assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString(); int keylength = small.rowdef.width(0); assert (keylength == large.rowdef.width(0)); - indexContainer conj = new indexContainer(null, small.rowdef); // start with empty search result + indexContainer conj = new indexContainer(null, small.rowdef, small.newRWI); // start with empty search result Iterator se = small.entries(); indexRWIEntry ie0, ie1; long stamp = System.currentTimeMillis(); @@ -311,7 +328,7 @@ public class indexContainer extends kelondroRowSet { assert i1.rowdef.equals(i2.rowdef) : "i1 = " + i1.rowdef.toString() + "; i2 = " + i2.rowdef.toString(); int keylength = i1.rowdef.width(0); assert (keylength == i2.rowdef.width(0)); - indexContainer conj = new indexContainer(null, i1.rowdef); // start with empty search result + indexContainer conj = new indexContainer(null, i1.rowdef, i1.newRWI); // start with empty search result if (!((i1.order().signature().equals(i2.order().signature())) && (i1.primarykey() == i2.primarykey()))) return conj; // ordering must be equal Iterator e1 = i1.entries(); diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index ad00d3f28..c80faa90e 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -58,6 +58,7 @@ public final class indexRAMRI implements indexRI { private String indexArrayFileName; private kelondroRow payloadrow; private kelondroRow bufferStructureBasis; + private boolean newRWI; // calculated constants private static String maxKey; @@ -66,7 +67,7 @@ public final class indexRAMRI implements indexRI { //minKey = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; } - public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceLimitInit, String dumpname, serverLog log) { + public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceLimitInit, String dumpname, serverLog log, boolean newRWI) { // creates a new index cache // the cache has a back-end where indexes that do not fit in the cache are flushed @@ -78,6 +79,7 @@ public final class indexRAMRI implements indexRI { this.cacheMaxCount = 10000; this.cacheReferenceLimit = wCacheReferenceLimitInit; this.log = log; + this.newRWI = newRWI; this.indexArrayFileName = dumpname; this.payloadrow = payloadrow; this.bufferStructureBasis = new kelondroRow( @@ -178,7 +180,10 @@ public final class indexRAMRI implements indexRI { if ((row == null) || (row.empty(0)) || (row.empty(3))) continue; wordHash = row.getColString(0, "UTF-8"); //creationTime = kelondroRecords.bytes2long(row[2]); - wordEntry = new indexRWIEntryOld(row.getColBytes(3)); + if (newRWI) + wordEntry = new indexRWIEntryNew(row.getColBytes(3)); + else + wordEntry = new indexRWIEntryOld(row.getColBytes(3)); // store to cache addEntry(wordHash, wordEntry, startTime, false); urlCount++; @@ -421,25 +426,29 @@ public final class indexRAMRI implements indexRI { public synchronized indexContainer addEntries(indexContainer container, long updateTime, boolean dhtCase) { // this puts the entries into the cache, not into the assortment directly int added = 0; + if ((container == null) || (container.size() == 0)) return null; // put new words into cache - // put container into wCache - String wordHash = container.getWordHash(); - indexContainer entries = (indexContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null - if (entries == null) entries = new indexContainer(wordHash, container.row()); + String wordHash = container.getWordHash(); + indexContainer entries = (indexContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null + if (entries == null) { + entries = container.topLevelClone(); + added = entries.size(); + } else { added = entries.add(container, -1); - if (added > 0) { - cache.put(wordHash, entries); - hashScore.addScore(wordHash, added); - hashDate.setScore(wordHash, intTime(updateTime)); - } - entries = null; + } + if (added > 0) { + cache.put(wordHash, entries); + hashScore.addScore(wordHash, added); + hashDate.setScore(wordHash, intTime(updateTime)); + } + entries = null; return null; } public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { indexContainer container = (indexContainer) cache.get(wordHash); - if (container == null) container = new indexContainer(wordHash, this.payloadrow); + if (container == null) container = new indexContainer(wordHash, this.payloadrow, newEntry instanceof indexRWIEntryNew); indexRWIEntry[] entries = new indexRWIEntry[] { newEntry }; if (container.add(entries, updateTime) > 0) { cache.put(wordHash, container); diff --git a/source/de/anomic/index/indexRWIEntryNew.java b/source/de/anomic/index/indexRWIEntryNew.java index 9d67779e1..841ef3a8a 100644 --- a/source/de/anomic/index/indexRWIEntryNew.java +++ b/source/de/anomic/index/indexRWIEntryNew.java @@ -52,7 +52,7 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry { new kelondroColumn("y", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "lother"), new kelondroColumn("m", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlLength"), new kelondroColumn("n", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlComps"), - new kelondroColumn("g", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 1, "typeofword"), + new kelondroColumn("g", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "typeofword"), new kelondroColumn("z", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 4, "flags"), new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "hitcount"), new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "posintext"), @@ -132,7 +132,7 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry { this.entry.setCol(col_lother, outlinksOther); this.entry.setCol(col_urlLength, urlLength); this.entry.setCol(col_urlComps, urlComps); - this.entry.setCol(col_typeofword, 0); // TODO: grammatical classification + this.entry.setCol(col_typeofword, new byte[]{(byte) 0}); // TODO: grammatical classification this.entry.setCol(col_flags, null); // TODO: generate flags this.entry.setCol(col_hitcount, hitcount); this.entry.setCol(col_posintext, posintext); @@ -159,7 +159,7 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry { int domlen = plasmaURL.domLengthEstimation(oldEntry.urlHash()); this.entry.setCol(col_urlLength, domlen * 2); // estimated this.entry.setCol(col_urlComps, domlen / 3); // estimated - this.entry.setCol(col_typeofword, 0); + this.entry.setCol(col_typeofword, new byte[]{(byte) 0}); this.entry.setCol(col_flags, null); this.entry.setCol(col_hitcount, oldEntry.hitcount()); this.entry.setCol(col_posintext, oldEntry.posintext()); diff --git a/source/de/anomic/index/indexURLEntryOld.java b/source/de/anomic/index/indexURLEntryOld.java index 17da67913..a956b79a8 100644 --- a/source/de/anomic/index/indexURLEntryOld.java +++ b/source/de/anomic/index/indexURLEntryOld.java @@ -160,7 +160,13 @@ public class indexURLEntryOld implements indexURLEntry { this.snippet = prop.getProperty("snippet", ""); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); - this.word = (prop.containsKey("word")) ? new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; + this.word = null; + if (prop.containsKey("word")) { + this.word = new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))); + } + if (prop.containsKey("wi")) { + this.word = new indexRWIEntryNew(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))); + } } catch (Exception e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2:" diff --git a/source/de/anomic/kelondro/kelondroBase64Order.java b/source/de/anomic/kelondro/kelondroBase64Order.java index d4c8f33f6..676457e20 100644 --- a/source/de/anomic/kelondro/kelondroBase64Order.java +++ b/source/de/anomic/kelondro/kelondroBase64Order.java @@ -298,9 +298,9 @@ public class kelondroBase64Order extends kelondroAbstractOrder implements kelond bc = b[boffset + i]; assert (bc >= 0) && (bc < 128) : "bc = " + bc + ", b = " + serverLog.arrayList(b, boffset, len); acc = ahpla[ac]; - assert (acc >= 0) : "acc = " + acc + ", a = " + serverLog.arrayList(a, aoffset, len) + ", aoffset = 0x" + Integer.toHexString(aoffset) + ", i = " + i + "\n" + serverLog.table(a, 16, aoffset); + assert (acc >= 0) : "acc = " + acc + ", a = " + serverLog.arrayList(a, aoffset, len) + "/" + new String(a, aoffset, len) + ", aoffset = 0x" + Integer.toHexString(aoffset) + ", i = " + i + "\n" + serverLog.table(a, 16, aoffset); bcc = ahpla[bc]; - assert (bcc >= 0) : "bcc = " + bcc + ", b = " + serverLog.arrayList(b, boffset, len) + ", boffset = 0x" + Integer.toHexString(boffset) + ", i = " + i + "\n" + serverLog.table(b, 16, boffset); + assert (bcc >= 0) : "bcc = " + bcc + ", b = " + serverLog.arrayList(b, boffset, len) + "/" + new String(b, boffset, len) + ", boffset = 0x" + Integer.toHexString(boffset) + ", i = " + i + "\n" + serverLog.table(b, 16, boffset); if (acc > bcc) return 1; if (acc < bcc) return -1; // else the bytes are equal and it may go on yet undecided diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index 603abdf7f..f5e45e291 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -130,7 +130,7 @@ public class kelondroRowCollection { } public static final int exportOverheadSize = 14; - + public byte[] exportCollection() { // returns null if the collection is empty trim(); @@ -147,7 +147,7 @@ public class kelondroRowCollection { entry.setCol(exp_collection, chunkcache); return entry.bytes(); } - + public kelondroRow row() { return this.rowdef; } @@ -155,7 +155,7 @@ public class kelondroRowCollection { private final void ensureSize(int elements) { int needed = elements * rowdef.objectsize(); if (chunkcache.length >= needed) return; - byte[] newChunkcache = new byte[needed * 12 / 10]; // increase space by 20% + byte[] newChunkcache = new byte[needed * 2]; // increase space System.arraycopy(chunkcache, 0, newChunkcache, 0, chunkcache.length); chunkcache = newChunkcache; newChunkcache = null; @@ -441,7 +441,7 @@ public class kelondroRowCollection { protected final int swap(int i, int j, int p) { if (i == j) return p; - if (this.chunkcount * this.rowdef.objectsize() < this.chunkcache.length) { + if ((this.chunkcount + 1) * this.rowdef.objectsize() < this.chunkcache.length) { // there is space in the chunkcache that we can use as buffer System.arraycopy(chunkcache, this.rowdef.objectsize() * i, chunkcache, chunkcache.length - this.rowdef.objectsize(), this.rowdef.objectsize()); System.arraycopy(chunkcache, this.rowdef.objectsize() * j, chunkcache, this.rowdef.objectsize() * i, this.rowdef.objectsize()); diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index 69f858437..9f746848f 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -175,8 +175,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd } public void shape() { - //System.out.println("SHAPE"); - if (this.sortOrder == null) return; // we cannot shape without an object order + assert (this.sortOrder != null); // we cannot shape without an object order synchronized (chunkcache) { resolveMarkedRemoved(); super.sort(); @@ -246,7 +245,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd private int find(byte[] a, int astart, int alength) { // returns the chunknumber; -1 if not found - if (this.sortOrder == null) return iterativeSearch(a, astart, alength); + if (this.sortOrder == null) return iterativeSearch(a, astart, alength, 0, this.chunkcount); // check if a re-sorting make sense if ((this.chunkcount - this.sortBound) > collectionReSortLimit) shape(); @@ -256,20 +255,20 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd if (p >= 0) return p; // then find in unsorted area - return iterativeSearch(a, astart, alength); + return iterativeSearch(a, astart, alength, this.sortBound, this.chunkcount); } - private int iterativeSearch(byte[] key, int astart, int alength) { + private int iterativeSearch(byte[] key, int astart, int alength, int leftBorder, int rightBound) { // returns the chunknumber if (this.sortOrder == null) { - for (int i = this.sortBound; i < this.chunkcount; i++) { + for (int i = leftBorder; i < rightBound; i++) { if (match(key, astart, alength, i)) return i; } return -1; } else { - for (int i = this.sortBound; i < this.chunkcount; i++) { + for (int i = leftBorder; i < rightBound; i++) { if (compare(key, astart, alength, i) == 0) return i; } return -1; @@ -322,7 +321,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd private boolean match(byte[] a, int astart, int alength, int chunknumber) { if (chunknumber >= chunkcount) return false; int i = 0; - int p = chunknumber * this.rowdef.objectsize(); + int p = chunknumber * this.rowdef.objectsize() + this.rowdef.colstart[this.sortColumn]; final int len = Math.min(this.rowdef.width(this.sortColumn), Math.min(alength, a.length - astart)); while (i < len) if (a[astart + i++] != chunkcache[p++]) return false; return ((len == this.rowdef.width(this.sortColumn)) || (chunkcache[len] == 0)) ; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 39def7504..3bc0afe9f 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -93,7 +93,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.ranking = ranking; this.urlStore = urlStore; this.snippetCache = snippetCache; - this.rcContainers = new indexContainer(null, wordIndex.payloadrow()); + this.rcContainers = wordIndex.emptyContainer(null); this.rcContainerFlushCount = 0; this.rcAbstracts = (query.size() > 1) ? new TreeMap() : null; // generate abstracts only for combined searches this.profileLocal = localTiming; @@ -139,7 +139,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { long secondaryTimeout = System.currentTimeMillis() + profileGlobal.duetime() / 3 * 2; long primaryTimeout = System.currentTimeMillis() + profileGlobal.duetime(); primarySearchThreads = yacySearch.primaryRemoteSearches(plasmaSearchQuery.hashSet2hashString(query.queryHashes), "", - query.prefer, query.urlMask, query.maxDistance, urlStore, rcContainers, rcAbstracts, + query.prefer, query.urlMask, query.maxDistance, urlStore, wordIndex, rcContainers, rcAbstracts, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); // meanwhile do a local search @@ -280,7 +280,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls); System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words); secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch( - words, urls, urlStore, rcContainers, peer, plasmaSwitchboard.urlBlacklist, snippetCache, + words, urls, urlStore, wordIndex, rcContainers, peer, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking); } @@ -357,7 +357,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { assert (rcLocal != null); - indexContainer searchResult = new indexContainer(null, rcLocal.row()); + indexContainer searchResult = wordIndex.emptyContainer(null); long preorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_PRESORT); profileLocal.startTimer(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 57eeda707..ef52104ec 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -134,7 +134,6 @@ import de.anomic.http.httpc; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; import de.anomic.plasma.plasmaURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; @@ -237,6 +236,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public dbImportManager dbImportManager; public plasmaDHTFlush transferIdxThread = null; private plasmaDHTChunk dhtTransferChunk = null; + private boolean newIndex; /* * Remote Proxy configuration @@ -434,8 +434,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ramNURL, getConfigBool("useFlexTableForNURL", false), ramEURL, getConfigBool("useFlexTableForEURL", true), ramLURL_time); + newIndex = getConfigBool("useCollectionIndex", false); try { - wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log, getConfigBool("useCollectionIndex", false)); + wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log, newIndex); } catch (IOException e1) { e1.printStackTrace(); System.exit(-1); @@ -1672,28 +1673,28 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String word = (String) wentry.getKey(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); String wordHash = plasmaURL.word2hash(word); - indexRWIEntry wordIdxEntry = new indexRWIEntryOld( - urlHash, - urlLength, urlComps, - wordStat.count, - document.getMainLongTitle().length(), - condenser.RESULT_SIMI_WORDS, - condenser.RESULT_SIMI_SENTENCES, - wordStat.posInText, - wordStat.posInPhrase, - wordStat.numOfPhrase, - 0, - newEntry.size(), - docDate.getTime(), - System.currentTimeMillis(), - condenser.RESULT_WORD_ENTROPHY, - language, - doctype, - ioLinks[0].intValue(), - ioLinks[1].intValue(), - true - ); - indexContainer wordIdxContainer = new indexContainer(wordHash, wordIndex.payloadrow()); + indexRWIEntry wordIdxEntry = wordIndex.newRWIEntry( + urlHash, + urlLength, urlComps, + wordStat.count, + document.getMainLongTitle().length(), + condenser.RESULT_SIMI_WORDS, + condenser.RESULT_SIMI_SENTENCES, + wordStat.posInText, + wordStat.posInPhrase, + wordStat.numOfPhrase, + 0, + newEntry.size(), + docDate.getTime(), + System.currentTimeMillis(), + condenser.RESULT_WORD_ENTROPHY, + language, + doctype, + ioLinks[0].intValue(), + ioLinks[1].intValue(), + true + ); + indexContainer wordIdxContainer = wordIndex.emptyContainer(wordHash); wordIdxContainer.add(wordIdxEntry); tmpContainers.add(wordIdxContainer); } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index e5c05dd34..75ada6aa5 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -43,6 +43,7 @@ import de.anomic.index.indexContainerOrder; import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRAMRI; import de.anomic.index.indexRI; +import de.anomic.index.indexRWIEntryNew; import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; @@ -60,7 +61,8 @@ public final class plasmaWordIndex implements indexRI { private static final String indexAssortmentClusterPath = "ACLUSTER"; private static final int assortmentCount = 64; - private static final kelondroRow payloadrow = indexRWIEntryOld.urlEntryRow; + private static final kelondroRow payloadrowold = indexRWIEntryOld.urlEntryRow; + private static final kelondroRow payloadrownew = indexRWIEntryNew.urlEntryRow; private final File oldDatabaseRoot; private final kelondroOrder indexOrder = new kelondroNaturalOrder(true); @@ -75,27 +77,31 @@ public final class plasmaWordIndex implements indexRI { public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log, boolean useCollectionIndex) throws IOException { this.oldDatabaseRoot = oldDatabaseRoot; - this.backend = new plasmaWordIndexFileCluster(oldDatabaseRoot, payloadrow, log); - this.dhtOutCache = new indexRAMRI(oldDatabaseRoot, payloadrow, (useCollectionIndex) ? 1024 : 64, "indexDump1.array", log); - this.dhtInCache = new indexRAMRI(oldDatabaseRoot, payloadrow, (useCollectionIndex) ? 1024 : 64, "indexDump2.array", log); - + this.backend = new plasmaWordIndexFileCluster(oldDatabaseRoot, payloadrowold, log); + File textindexcache = new File(newIndexRoot, "PUBLIC/TEXT/RICACHE"); + if (!(textindexcache.exists())) textindexcache.mkdirs(); + if (useCollectionIndex) { + this.dhtOutCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump1.array", log, true); + this.dhtInCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump2.array", log, true); + } else { + this.dhtOutCache = new indexRAMRI(oldDatabaseRoot, payloadrowold, 64, "indexDump1.array", log, false); + this.dhtInCache = new indexRAMRI(oldDatabaseRoot, payloadrowold, 64, "indexDump2.array", log, false); + } + // create assortment cluster path File assortmentClusterPath = new File(oldDatabaseRoot, indexAssortmentClusterPath); this.assortmentBufferSize = bufferkb; // create collections storage path - File textindexpath = new File(newIndexRoot, "PUBLIC/TEXT"); - if (!(textindexpath.exists())) textindexpath.mkdirs(); + File textindexcollections = new File(newIndexRoot, "PUBLIC/TEXT/RICOLLECTION"); + if (!(textindexcollections.exists())) textindexcollections.mkdirs(); if (useCollectionIndex) { - this.collections = new indexCollectionRI(textindexpath, "test_generation1", bufferkb * 1024, preloadTime, payloadrow); - if (assortmentClusterPath.exists()) - this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, payloadrow, assortmentBufferSize, preloadTime, log); - else - this.assortmentCluster = null; + this.collections = new indexCollectionRI(textindexcollections, "collection", bufferkb * 1024, preloadTime, payloadrownew); + this.assortmentCluster = null; } else { this.collections = null; if (!(assortmentClusterPath.exists())) assortmentClusterPath.mkdirs(); - this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, payloadrow, assortmentBufferSize, preloadTime, log); + this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, payloadrowold, assortmentBufferSize, preloadTime, log); } busyCacheFlush = false; @@ -105,7 +111,38 @@ public final class plasmaWordIndex implements indexRI { } public kelondroRow payloadrow() { - return payloadrow; + if (useCollectionIndex) return payloadrownew; else return payloadrowold; + } + + public indexRWIEntry newRWIEntry( + String urlHash, + int urlLength, + int urlComps, + int titleLength, + int hitcount, + int wordcount, + int phrasecount, + int posintext, + int posinphrase, + int posofphrase, + int worddistance, + int sizeOfPage, + long lastmodified, + long updatetime, + int quality, + String language, + char doctype, + int outlinksSame, + int outlinksOther, + boolean local ) { + if (useCollectionIndex) + return new indexRWIEntryNew(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount, + posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype, + outlinksSame, outlinksOther, local); + else + return new indexRWIEntryOld(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount, + posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype, + outlinksSame, outlinksOther, local); } public File getRoot() { @@ -198,10 +235,12 @@ public final class plasmaWordIndex implements indexRI { } public indexContainer emptyContainer(String wordHash) { - return new indexContainer(wordHash, payloadrow); + return new indexContainer(wordHash, payloadrow(), useCollectionIndex); } - + public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) { + if ((useCollectionIndex) && (entry instanceof indexRWIEntryOld)) entry = new indexRWIEntryNew((indexRWIEntryOld) entry); + // set dhtInCase depending on wordHash if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(wordHash))) dhtInCase = true; @@ -215,7 +254,21 @@ public final class plasmaWordIndex implements indexRI { return null; } + private indexContainer convertOld2New(indexContainer entries) { + // convert old entries to new entries + indexContainer newentries = new indexContainer(entries.getWordHash(), payloadrownew, useCollectionIndex); + Iterator i = entries.entries(); + indexRWIEntryOld old; + while (i.hasNext()) { + old = (indexRWIEntryOld) i.next(); + newentries.add(new indexRWIEntryNew(old)); + } + return newentries; + } + public indexContainer addEntries(indexContainer entries, long updateTime, boolean dhtInCase) { + if ((useCollectionIndex) && (entries.row().objectsize() == payloadrowold.objectsize())) entries = convertOld2New(entries); + // set dhtInCase depending on wordHash if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(entries.getWordHash()))) dhtInCase = true; @@ -330,23 +383,23 @@ public final class plasmaWordIndex implements indexRI { wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); wordHash = plasmaURL.word2hash(word); - ientry = new indexRWIEntryOld(urlHash, - urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(), - wprop.count, - condenser.RESULT_SIMI_WORDS, - condenser.RESULT_SIMI_SENTENCES, - wprop.posInText, - wprop.posInPhrase, - wprop.numOfPhrase, - 0, - size, - urlModified.getTime(), - System.currentTimeMillis(), - condenser.RESULT_WORD_ENTROPHY, - language, - doctype, - outlinksSame, outlinksOther, - true); + ientry = newRWIEntry(urlHash, + urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(), + wprop.count, + condenser.RESULT_SIMI_WORDS, + condenser.RESULT_SIMI_SENTENCES, + wprop.posInText, + wprop.posInPhrase, + wprop.numOfPhrase, + 0, + size, + urlModified.getTime(), + System.currentTimeMillis(), + condenser.RESULT_WORD_ENTROPHY, + language, + doctype, + outlinksSame, outlinksOther, + true); addEntry(wordHash, ientry, System.currentTimeMillis(), false); } // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + @@ -357,23 +410,22 @@ public final class plasmaWordIndex implements indexRI { public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) { long start = System.currentTimeMillis(); - // get from cache - indexContainer container = dhtOutCache.getContainer(wordHash, urlselection, true, -1); + // get from cache + indexContainer container = dhtOutCache.getContainer(wordHash, urlselection, true, -1); + if (container == null) { + container = dhtInCache.getContainer(wordHash, urlselection, true, -1); + } else { + container.add(dhtInCache.getContainer(wordHash, urlselection, true, -1), -1); + } + + // get from collection index + if (useCollectionIndex) { if (container == null) { - container = dhtInCache.getContainer(wordHash, urlselection, true, -1); + container = collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime); } else { - container.add(dhtInCache.getContainer(wordHash, urlselection, true, -1), -1); + container.add(collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1); } - - // get from collection index - if (useCollectionIndex) { - if (container == null) { - container = collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime); - } else { - container.add(collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1); - } - } - + } else { // get from assortments if (assortmentCluster != null) { if (container == null) { @@ -383,7 +435,7 @@ public final class plasmaWordIndex implements indexRI { container.add(assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1); } } - + // get from backend if (maxTime > 0) { maxTime = maxTime - (System.currentTimeMillis() - start); @@ -394,7 +446,8 @@ public final class plasmaWordIndex implements indexRI { } else { container.add(backend.getContainer(wordHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime), -1); } - return container; + } + return container; } public Map getContainers(Set wordHashes, Set urlselection, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) { @@ -429,10 +482,7 @@ public final class plasmaWordIndex implements indexRI { public int size() { if (useCollectionIndex) - return java.lang.Math.max(collections.size(), - java.lang.Math.max((assortmentCluster == null) ? 0 : assortmentCluster.size(), - java.lang.Math.max(backend.size(), - java.lang.Math.max(dhtInCache.size(), dhtOutCache.size())))); + return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size())); else return java.lang.Math.max((assortmentCluster == null) ? 0 : assortmentCluster.size(), java.lang.Math.max(backend.size(), @@ -441,17 +491,18 @@ public final class plasmaWordIndex implements indexRI { public int indexSize(String wordHash) { int size = 0; - try { - plasmaWordIndexFile entity = backend.getEntity(wordHash, true, -1); - if (entity != null) { - size += entity.size(); - entity.close(); - } - } catch (IOException e) {} - if (useCollectionIndex) size += collections.indexSize(wordHash); + size += dhtInCache.indexSize(wordHash); + size += dhtOutCache.indexSize(wordHash); + if (useCollectionIndex) { + size += collections.indexSize(wordHash); + } else try { size += (assortmentCluster == null) ? 0 : assortmentCluster.indexSize(wordHash); - size += dhtInCache.indexSize(wordHash); - size += dhtOutCache.indexSize(wordHash); + plasmaWordIndexFile entity = backend.getEntity(wordHash, true, -1); + if (entity != null) { + size += entity.size(); + entity.close(); + } + } catch (IOException e) {} return size; } @@ -459,44 +510,51 @@ public final class plasmaWordIndex implements indexRI { synchronized (this) { dhtInCache.close(waitingBoundSeconds); dhtOutCache.close(waitingBoundSeconds); - if (useCollectionIndex) collections.close(-1); - if (assortmentCluster != null) assortmentCluster.close(-1); - backend.close(10); + if (useCollectionIndex) { + collections.close(-1); + } else { + if (assortmentCluster != null) assortmentCluster.close(-1); + backend.close(10); + } } } public indexContainer deleteContainer(String wordHash) { - indexContainer c = new indexContainer(wordHash, payloadrow); - c.add(dhtInCache.deleteContainer(wordHash), -1); - c.add(dhtOutCache.deleteContainer(wordHash), -1); - if (useCollectionIndex) c.add(collections.deleteContainer(wordHash), -1); + indexContainer c = new indexContainer(wordHash, payloadrow(), useCollectionIndex); + c.add(dhtInCache.deleteContainer(wordHash), -1); + c.add(dhtOutCache.deleteContainer(wordHash), -1); + if (useCollectionIndex) { + c.add(collections.deleteContainer(wordHash), -1); + } else { if (assortmentCluster != null) c.add(assortmentCluster.deleteContainer(wordHash), -1); c.add(backend.deleteContainer(wordHash), -1); - return c; + } + return c; } public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { boolean removed = false; - removed = removed | (dhtInCache.removeEntry(wordHash, urlHash, deleteComplete)); - removed = removed | (dhtOutCache.removeEntry(wordHash, urlHash, deleteComplete)); - if (useCollectionIndex) {removed = removed | (collections.removeEntry(wordHash, urlHash, deleteComplete));} + removed = removed | (dhtInCache.removeEntry(wordHash, urlHash, deleteComplete)); + removed = removed | (dhtOutCache.removeEntry(wordHash, urlHash, deleteComplete)); + if (useCollectionIndex) { + removed = removed | (collections.removeEntry(wordHash, urlHash, deleteComplete)); + } else { if (assortmentCluster != null) removed = removed | (assortmentCluster.removeEntry(wordHash, urlHash, deleteComplete)); removed = removed | backend.removeEntry(wordHash, urlHash, deleteComplete); - return removed; + } + return removed; } public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) { int removed = 0; removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete); removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete); - //if (removed == urlHashes.size()) return removed; if (useCollectionIndex) { removed += collections.removeEntries(wordHash, urlHashes, deleteComplete); - //if (removed == urlHashes.size()) return removed; + } else if (assortmentCluster != null) { + removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete); + removed += backend.removeEntries(wordHash, urlHashes, deleteComplete); } - if (assortmentCluster != null) removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete); - //if (removed == urlHashes.size()) return removed; - removed += backend.removeEntries(wordHash, urlHashes, deleteComplete); return removed; } @@ -505,10 +563,11 @@ public final class plasmaWordIndex implements indexRI { removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; if (useCollectionIndex) { - removed += collections.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; - } else removed += "0, "; - if (assortmentCluster != null) removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; - removed += backend.removeEntries(wordHash, urlHashes, deleteComplete); + removed += collections.removeEntries(wordHash, urlHashes, deleteComplete); + } else { + if (assortmentCluster != null) removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; + removed += backend.removeEntries(wordHash, urlHashes, deleteComplete); + } return removed; } @@ -522,7 +581,7 @@ public final class plasmaWordIndex implements indexRI { // urlHash assigned. This can only work if the entry is really fresh // and can be found in the RAM cache // this returns the number of deletion that had been possible - return dhtInCache.tryRemoveURLs(urlHash); + return dhtInCache.tryRemoveURLs(urlHash) | dhtOutCache.tryRemoveURLs(urlHash); } public TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) throws IOException { @@ -562,28 +621,15 @@ public final class plasmaWordIndex implements indexRI { if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) { return dhtOutCache.wordContainers(startWordHash, false); } - if ((resourceLevel == plasmaWordIndex.RL_COLLECTIONS) && (useCollectionIndex)) { + if (useCollectionIndex) { return new kelondroMergeIterator( dhtOutCache.wordContainers(startWordHash, false), collections.wordContainers(startWordHash, false), new indexContainerOrder(kelondroNaturalOrder.naturalOrder), indexContainer.containerMergeMethod, true); - } - if (resourceLevel == plasmaWordIndex.RL_ASSORTMENTS) { - if (useCollectionIndex) { - return new kelondroMergeIterator( - new kelondroMergeIterator( - dhtOutCache.wordContainers(startWordHash, false), - collections.wordContainers(startWordHash, false), - new indexContainerOrder(kelondroNaturalOrder.naturalOrder), - indexContainer.containerMergeMethod, - true), - (assortmentCluster == null) ? null : assortmentCluster.wordContainers(startWordHash, true, false), - new indexContainerOrder(kelondroNaturalOrder.naturalOrder), - indexContainer.containerMergeMethod, - true); - } else { + } else { + if (resourceLevel == plasmaWordIndex.RL_ASSORTMENTS) { return new kelondroMergeIterator( dhtOutCache.wordContainers(startWordHash, false), (assortmentCluster == null) ? null : assortmentCluster.wordContainers(startWordHash, true, false), @@ -591,26 +637,7 @@ public final class plasmaWordIndex implements indexRI { indexContainer.containerMergeMethod, true); } - } - if (resourceLevel == plasmaWordIndex.RL_WORDFILES) { - if (useCollectionIndex) { - return new kelondroMergeIterator( - new kelondroMergeIterator( - new kelondroMergeIterator( - dhtOutCache.wordContainers(startWordHash, false), - collections.wordContainers(startWordHash, false), - new indexContainerOrder(kelondroNaturalOrder.naturalOrder), - indexContainer.containerMergeMethod, - true), - (assortmentCluster == null) ? null : assortmentCluster.wordContainers(startWordHash, true, false), - new indexContainerOrder(kelondroNaturalOrder.naturalOrder), - indexContainer.containerMergeMethod, - true), - backend.wordContainers(startWordHash, false), - new indexContainerOrder(kelondroNaturalOrder.naturalOrder), - indexContainer.containerMergeMethod, - true); - } else { + if (resourceLevel == plasmaWordIndex.RL_WORDFILES) { return new kelondroMergeIterator( new kelondroMergeIterator( dhtOutCache.wordContainers(startWordHash, false), @@ -719,7 +746,7 @@ public final class plasmaWordIndex implements indexRI { try { entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true); int size = entity.size(); - indexContainer container = new indexContainer(wordhash, payloadrow); + indexContainer container = new indexContainer(wordhash, payloadrow(), useCollectionIndex); try { Iterator entries = entity.elements(true); diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 85356539a..2cb31b479 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -218,7 +218,7 @@ public final class plasmaWordIndexAssortment { if (row == null) return null; String wordHash = row.getColString(0, null); final long updateTime = row.getColLong(2); - indexContainer container = new indexContainer(wordHash, payloadrow); + indexContainer container = new indexContainer(wordHash, payloadrow, false); int al = assortmentCapacity(row.objectsize()); for (int i = 0; i < al; i++) { container.add(new indexRWIEntry[] { new indexRWIEntryOld(row.getColBytes(3 + i)) }, updateTime); diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index 983f1307e..377cc8f09 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -165,7 +165,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { indexContainer c; Iterator i = newContainer.entries(); for (int j = clusterStart; j >= 1; j--) { - c = new indexContainer(newContainer.getWordHash(), payloadrow); + c = new indexContainer(newContainer.getWordHash(), payloadrow, false); for (int k = 0; k < j; k++) { if (i.hasNext()) { c.add((indexRWIEntry) i.next(), newContainer.updated()); @@ -179,7 +179,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { } public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { - indexContainer container = new indexContainer(wordHash, payloadrow); + indexContainer container = new indexContainer(wordHash, payloadrow, false); container.add(newEntry); return addEntries(container, updateTime, dhtCase); } @@ -220,7 +220,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { Iterator i = newContainer.entries(); for (int j = testsize - 1; j >= 0; j--) { if (spaces[j] == 0) continue; - c = new indexContainer(newContainer.getWordHash(), payloadrow); + c = new indexContainer(newContainer.getWordHash(), payloadrow, false); for (int k = 0; k <= j; k++) { assert (i.hasNext()); c.add((indexRWIEntry) i.next(), newContainer.updated()); @@ -258,7 +258,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { public indexContainer deleteContainer(String wordHash, long maxTime) { // removes all records from all the assortments and return them - indexContainer buffer, record = new indexContainer(wordHash, payloadrow); + indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false); long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long remainingTime; for (int i = 0; i < clusterCount; i++) { @@ -283,7 +283,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { */ public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { - indexContainer buffer, record = new indexContainer(wordHash, payloadrow); + indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false); boolean found = false; for (int i = 0; i < clusterCount; i++) { buffer = assortments[i].remove(wordHash); @@ -299,7 +299,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { } public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) { - indexContainer buffer, record = new indexContainer(wordHash, payloadrow); + indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false); int initialSize = urlHashes.size(); for (int i = 0; i < clusterCount; i++) { buffer = assortments[i].remove(wordHash); @@ -324,7 +324,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) { // collect all records from all the assortments and return them - indexContainer buffer, record = new indexContainer(wordHash, payloadrow); + indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false); long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; for (int i = 0; i < clusterCount; i++) { buffer = assortments[i].get(wordHash); diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java index 720c01ac4..8cd76accd 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java @@ -234,7 +234,7 @@ public class plasmaWordIndexFileCluster implements indexRI { if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute if (exists(wordHash)) { plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10); - indexContainer container = new indexContainer(wordHash, payloadrow); + indexContainer container = new indexContainer(wordHash, payloadrow, false); indexRWIEntry entry; Iterator i = entity.elements(true); while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) { @@ -243,7 +243,7 @@ public class plasmaWordIndexFileCluster implements indexRI { } return container; } else { - return new indexContainer(wordHash, payloadrow); + return new indexContainer(wordHash, payloadrow, false); } } @@ -258,7 +258,7 @@ public class plasmaWordIndexFileCluster implements indexRI { public indexContainer deleteContainer(String wordHash) { plasmaWordIndexFile.removePlasmaIndex(databaseRoot, wordHash); - return new indexContainer(wordHash, payloadrow); + return new indexContainer(wordHash, payloadrow, false); } public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { @@ -303,7 +303,7 @@ public class plasmaWordIndexFileCluster implements indexRI { } public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { - indexContainer container = new indexContainer(wordHash, payloadrow); + indexContainer container = new indexContainer(wordHash, payloadrow, false); container.add(newEntry); return addEntries(container, updateTime, dhtCase); } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 16ba9f067..926f0a5be 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -57,7 +57,6 @@ import de.anomic.http.httpc; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; import de.anomic.plasma.plasmaURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.net.URL; @@ -66,6 +65,7 @@ import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverByteBuffer; import de.anomic.server.serverCodings; @@ -373,6 +373,7 @@ public final class yacyClient { boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, + plasmaWordIndex wordIndex, indexContainer containerCache, Map abstractCache, plasmaURLPattern blacklist, @@ -493,7 +494,7 @@ public final class yacyClient { final int words = wordhashes.length() / yacySeedDB.commonHashLength; indexContainer[] container = new indexContainer[words]; for (int i = 0; i < words; i++) { - container[i] = new indexContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), indexRWIEntryOld.urlEntryRow); + container[i] = wordIndex.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength)); } // insert results to containers @@ -517,7 +518,7 @@ public final class yacyClient { int urlLength = comp.url().toNormalform().length(); int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length; - entry = new indexRWIEntryOld( + entry = wordIndex.newRWIEntry( urlEntry.hash(), urlLength, urlComps, diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index b9500fe1e..1f83742c9 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -57,6 +57,7 @@ import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSnippetCache; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.logging.serverLog; @@ -65,6 +66,7 @@ public class yacySearch extends Thread { final private String wordhashes, urlhashes; final private boolean global; final private plasmaCrawlLURL urlManager; + final private plasmaWordIndex wordIndex; final private indexContainer containerCache; final private Map abstractCache; final private plasmaURLPattern blacklist; @@ -77,7 +79,7 @@ public class yacySearch extends Thread { final private String prefer, filter; public yacySearch(String wordhashes, String urlhashes, String prefer, String filter, int maxDistance, - boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, + boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex, indexContainer containerCache, Map abstractCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { @@ -89,6 +91,7 @@ public class yacySearch extends Thread { this.filter = filter; this.global = global; this.urlManager = urlManager; + this.wordIndex = wordIndex; this.containerCache = containerCache; this.abstractCache = abstractCache; this.blacklist = blacklist; @@ -101,7 +104,7 @@ public class yacySearch extends Thread { } public void run() { - this.urls = yacyClient.search(wordhashes, urlhashes, prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); + this.urls = yacyClient.search(wordhashes, urlhashes, prefer, filter, maxDistance, global, targetPeer, urlManager, wordIndex, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); if (urls != null) { StringBuffer urllist = new StringBuffer(this.urls.length * 13); for (int i = 0; i < this.urls.length; i++) urllist.append(this.urls[i]).append(' '); @@ -194,7 +197,8 @@ public class yacySearch extends Thread { return result; } - public static yacySearch[] primaryRemoteSearches(String wordhashes, String urlhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, + public static yacySearch[] primaryRemoteSearches(String wordhashes, String urlhashes, String prefer, String filter, int maxDist, + plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex, indexContainer containerCache, Map abstractCache, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { @@ -210,14 +214,16 @@ public class yacySearch extends Thread { yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { searchThreads[i]= new yacySearch(wordhashes, urlhashes, prefer, filter, maxDist, true, targetPeers[i], - urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); + urlManager, wordIndex, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile); searchThreads[i].start(); //try {Thread.sleep(20);} catch (InterruptedException e) {} } return searchThreads; } - public static yacySearch secondaryRemoteSearch(String wordhashes, String urlhashes, plasmaCrawlLURL urlManager, indexContainer containerCache, + public static yacySearch secondaryRemoteSearch(String wordhashes, String urlhashes, + plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex, + indexContainer containerCache, String targethash, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { // check own peer status @@ -228,7 +234,7 @@ public class yacySearch extends Thread { final yacySeed targetPeer = yacyCore.seedDB.getConnected(targethash); if (targetPeer == null) return null; yacySearch searchThread = new yacySearch(wordhashes, urlhashes, "", "", 9999, true, targetPeer, - urlManager, containerCache, new TreeMap(), blacklist, snippetCache, timingProfile, rankingProfile); + urlManager, wordIndex, containerCache, new TreeMap(), blacklist, snippetCache, timingProfile, rankingProfile); searchThread.start(); return searchThread; }