diff --git a/source/de/anomic/index/indexAbstractContainer.java b/source/de/anomic/index/indexAbstractContainer.java
deleted file mode 100644
index 491332e3b..000000000
--- a/source/de/anomic/index/indexAbstractContainer.java
+++ /dev/null
@@ -1,64 +0,0 @@
-// indexAbstractConatiner.java
-// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
-// first published 20.05.2006 on http://www.anomic.de
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
-// $LastChangedRevision: 1986 $
-// $LastChangedBy: orbiter $
-//
-// LICENSE
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-package de.anomic.index;
-
-import de.anomic.kelondro.kelondroBase64Order;
-
-public abstract class indexAbstractContainer implements indexContainer {
-
-    private String wordHash;
-    private long updateTime;
-
-    public void setWordHash(String newWordHash) {
-        // this is used to replicate a container for different word indexes during global search
-        this.wordHash = newWordHash;
-    }
-
-    public long updated() {
-        return updateTime;
-    }
-
-    public String getWordHash() {
-        return wordHash;
-    }
-
-    public int add(indexEntry entry) {
-        return add(entry, System.currentTimeMillis());
-    }
-
-    public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
-        if (!wordHash.equals(this.wordHash)) return 0;
-        int count = 0;
-        for (int i = 0; i < urlHashes.length; i++) count += (remove(urlHashes[i]) == null) ?
0 : 1; - return count; - } - - public int hashCode() { - return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); - } - -} diff --git a/source/de/anomic/index/indexAbstractRI.java b/source/de/anomic/index/indexAbstractRI.java index fc6cc4e70..58029f64b 100644 --- a/source/de/anomic/index/indexAbstractRI.java +++ b/source/de/anomic/index/indexAbstractRI.java @@ -30,7 +30,7 @@ package de.anomic.index; public abstract class indexAbstractRI implements indexRI { public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { - indexTreeMapContainer container = new indexTreeMapContainer(wordHash); + indexContainer container = new indexRowSetContainer(wordHash); container.add(newEntry); return addEntries(container, updateTime, dhtCase); } diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index 7ff4dd53d..58d0af9b4 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -53,11 +53,8 @@ public interface indexContainer { public int add(indexEntry[] entries, long updateTime); public int add(indexContainer c, long maxTime); - public Set urlHashes(); - public boolean contains(String urlHash) ; public indexEntry get(String urlHash); - public indexEntry[] getEntryArray() ; - + public indexEntry remove(String urlHash); public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete); public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete); @@ -66,6 +63,4 @@ public interface indexContainer { public String toString(); public int hashCode(); - //public void joinConstructive(indexContainer c, long time, int maxDistance); - } diff --git a/source/de/anomic/index/indexRAMCacheRI.java b/source/de/anomic/index/indexRAMCacheRI.java index 1a7c9b37a..25b4bb2b1 100644 --- a/source/de/anomic/index/indexRAMCacheRI.java +++ b/source/de/anomic/index/indexRAMCacheRI.java @@ -101,7 +101,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { long wordsPerSecond = 0, wordcount = 0, urlcount = 0; Map.Entry entry; String wordHash; - indexTreeMapContainer container; + indexContainer container; long updateTime; indexEntry iEntry; kelondroRow.Entry row = dumpArray.row().newEntry(); @@ -110,7 +110,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { synchronized (kCache) { Iterator i = kCache.values().iterator(); while (i.hasNext()) { - container = (indexTreeMapContainer) i.next(); + container = (indexContainer) i.next(); // put entries on stack if (container != null) { @@ -139,7 +139,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { entry = (Map.Entry) i.next(); wordHash = (String) entry.getKey(); updateTime = getUpdateTime(wordHash); - container = (indexTreeMapContainer) entry.getValue(); + container = (indexContainer) entry.getValue(); // put entries on stack if (container != null) { @@ -269,7 +269,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { public int indexSize(String wordHash) { int size = 0; - indexTreeMapContainer cacheIndex = (indexTreeMapContainer) wCache.get(wordHash); + indexContainer cacheIndex = (indexContainer) wCache.get(wordHash); if (cacheIndex != null) size += cacheIndex.size(); return size; } @@ -326,13 +326,13 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { // find entries in kCache that are too old for that place and shift 
them to the wCache long time; Long l; - indexTreeMapContainer container; + indexContainer container; synchronized (kCache) { while (kCache.size() > 0) { l = (Long) kCache.firstKey(); time = l.longValue(); if (System.currentTimeMillis() - time < kCacheMaxAge) return; - container = (indexTreeMapContainer) kCache.remove(l); + container = (indexContainer) kCache.remove(l); addEntries(container, container.updated(), false); } } @@ -386,13 +386,13 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { } public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) { - return (indexTreeMapContainer) wCache.get(wordHash); + return (indexContainer) wCache.get(wordHash); } public indexContainer deleteContainer(String wordHash) { // returns the index that had been deleted synchronized (wCache) { - indexTreeMapContainer container = (indexTreeMapContainer) wCache.remove(wordHash); + indexContainer container = (indexContainer) wCache.remove(wordHash); hashScore.deleteScore(wordHash); hashDate.deleteScore(wordHash); return container; @@ -401,7 +401,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { synchronized (wCache) { - indexTreeMapContainer c = (indexTreeMapContainer) deleteContainer(wordHash); + indexContainer c = (indexContainer) deleteContainer(wordHash); if (c != null) { if (c.removeEntry(wordHash, urlHash, deleteComplete)) return true; this.addEntries(c, System.currentTimeMillis(), false); @@ -414,7 +414,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { if (urlHashes.size() == 0) return 0; int count = 0; synchronized (wCache) { - indexTreeMapContainer c = (indexTreeMapContainer) deleteContainer(wordHash); + indexContainer c = (indexContainer) deleteContainer(wordHash); if (c != null) { count = c.removeEntries(wordHash, urlHashes, deleteComplete); if (c.size() != 0) this.addEntries(c, System.currentTimeMillis(), false); @@ -432,13 +432,13 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { Iterator i = kCache.entrySet().iterator(); Map.Entry entry; Long l; - indexTreeMapContainer c; + indexContainer c; while (i.hasNext()) { entry = (Map.Entry) i.next(); l = (Long) entry.getKey(); // get container - c = (indexTreeMapContainer) entry.getValue(); + c = (indexContainer) entry.getValue(); if (c.remove(urlHash) != null) { if (c.size() == 0) { i.remove(); @@ -466,8 +466,8 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { } else synchronized (wCache) { // put container into wCache String wordHash = container.getWordHash(); - indexTreeMapContainer entries = (indexTreeMapContainer) wCache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null - if (entries == null) entries = new indexTreeMapContainer(wordHash); + indexContainer entries = (indexContainer) wCache.get(wordHash); // null pointer exception? wordhash != null! 
must be cache==null + if (entries == null) entries = new indexRowSetContainer(wordHash); added = entries.add(container, -1); if (added > 0) { wCache.put(wordHash, entries); @@ -482,15 +482,15 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { if (dhtCase) synchronized (kCache) { // put container into kCache - indexTreeMapContainer container = new indexTreeMapContainer(wordHash); + indexContainer container = new indexRowSetContainer(wordHash); container.add(newEntry); kCache.put(new Long(updateTime + kCacheInc), container); kCacheInc++; if (kCacheInc > 10000) kCacheInc = 0; return null; } else synchronized (wCache) { - indexTreeMapContainer container = (indexTreeMapContainer) wCache.get(wordHash); - if (container == null) container = new indexTreeMapContainer(wordHash); + indexContainer container = (indexContainer) wCache.get(wordHash); + if (container == null) container = new indexRowSetContainer(wordHash); indexEntry[] entries = new indexEntry[] { newEntry }; if (container.add(entries, updateTime) > 0) { wCache.put(wordHash, container); diff --git a/source/de/anomic/index/indexRowSetContainer.java b/source/de/anomic/index/indexRowSetContainer.java index c66ada562..39b38608d 100644 --- a/source/de/anomic/index/indexRowSetContainer.java +++ b/source/de/anomic/index/indexRowSetContainer.java @@ -30,22 +30,31 @@ import java.lang.reflect.Method; import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.Set; +import java.util.TreeMap; +import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroNaturalOrder; +import de.anomic.kelondro.kelondroOrder; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowSet; public class indexRowSetContainer extends kelondroRowSet implements indexContainer { private String wordHash; + + public indexRowSetContainer(String wordHash) { + this(wordHash, new kelondroNaturalOrder(true), 0); + } - public indexRowSetContainer(kelondroRow rowdef) { - super(rowdef); + public indexRowSetContainer(String wordHash, kelondroOrder ordering, int column) { + super(indexURLEntry.urlEntryRow); + this.wordHash = wordHash; + this.lastTimeWrote = 0; + this.setOrdering(ordering, column); } - + public indexContainer topLevelClone() { - indexContainer newContainer = new indexRowSetContainer(this.rowdef); - newContainer.setWordHash(this.wordHash); - newContainer.setOrdering(this.sortOrder, this.sortColumn); + indexContainer newContainer = new indexRowSetContainer(this.wordHash, this.sortOrder, this.sortColumn); newContainer.add(this, -1); return newContainer; } @@ -97,49 +106,74 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain private boolean addi(indexEntry entry) { // returns true if the new entry was added, false if it already existed - indexEntry oldEntry = new indexURLEntry(this.put(entry.toKelondroEntry())); // FIXME: see if cloning is necessary - if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container - this.put(oldEntry.toKelondroEntry()); // put it back - return false; + kelondroRow.Entry oldEntryRow = this.put(entry.toKelondroEntry()); + if (oldEntryRow == null) { + return true; + } else { + indexEntry oldEntry = new indexURLEntry(oldEntryRow); // FIXME: see if cloning is necessary + if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container + 
this.put(oldEntry.toKelondroEntry()); // put it back + return false; + } else { + return true; + } } - return (oldEntry == null); - } - - public boolean contains(String urlHash) { -// TODO Auto-generated method stub - return false; } public indexEntry get(String urlHash) { - // TODO Auto-generated method stub - return null; - } - - public indexEntry[] getEntryArray() { - // TODO Auto-generated method stub - return null; + kelondroRow.Entry entry = this.get(urlHash.getBytes()); + if (entry == null) return null; + return new indexURLEntry(entry); } public indexEntry remove(String urlHash) { - // TODO Auto-generated method stub - return null; + kelondroRow.Entry entry = this.remove(urlHash.getBytes()); + if (entry == null) return null; + return new indexURLEntry(entry); } public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { - // TODO Auto-generated method stub - return false; + if (!wordHash.equals(this.wordHash)) return false; + return remove(urlHash) != null; } public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) { - // TODO Auto-generated method stub - return 0; + if (!wordHash.equals(this.wordHash)) return 0; + int count = 0; + Iterator i = urlHashes.iterator(); + while (i.hasNext()) count += (remove((String) i.next()) == null) ? 0 : 1; + return count; } public Iterator entries() { - // TODO Auto-generated method stub - return null; + // returns an iterator of indexEntry objects + return new entryIterator(); } + public class entryIterator implements Iterator { + + Iterator rowEntryIterator; + + public entryIterator() { + rowEntryIterator = rows(); + } + + public boolean hasNext() { + return rowEntryIterator.hasNext(); + } + + public Object next() { + kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next(); + if (rentry == null) return null; + return new indexURLEntry(rentry); + } + + public void remove() { + rowEntryIterator.remove(); + } + + } + public static Method containerMergeMethod = null; static { try { @@ -162,10 +196,138 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain c.add((indexContainer) b, -1); return c; } + + public static indexContainer joinContainer(Set containers, long time, int maxDistance) { + + long stamp = System.currentTimeMillis(); + + // order entities by their size + TreeMap map = new TreeMap(); + indexContainer singleContainer; + Iterator i = containers.iterator(); + int count = 0; + while (i.hasNext()) { + // get next entity: + singleContainer = (indexContainer) i.next(); + + // check result + if ((singleContainer == null) || (singleContainer.size() == 0)) return new indexRowSetContainer(null); // as this is a cunjunction of searches, we have no result if any word is not known + + // store result in order of result size + map.put(new Long(singleContainer.size() * 1000 + count), singleContainer); + count++; + } + + // check if there is any result + if (map.size() == 0) return new indexRowSetContainer(null); // no result, nothing found + + // the map now holds the search results in order of number of hits per word + // we now must pairwise build up a conjunction of these sets + Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries + indexContainer searchA, searchB, searchResult = (indexContainer) map.remove(k); + while ((map.size() > 0) && (searchResult.size() > 0)) { + // take the first element of map which is a result and combine it with result + k = (Long) map.firstKey(); // the next smallest... 
+ time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); + searchA = searchResult; + searchB = (indexContainer) map.remove(k); + searchResult = indexRowSetContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance); + // free resources + searchA = null; + searchB = null; + } + + // in 'searchResult' is now the combined search result + if (searchResult.size() == 0) return new indexRowSetContainer(null); + return searchResult; + } + + // join methods + private static int log2(int x) { + int l = 0; + while (x > 0) {x = x >> 1; l++;} + return l; + } + + public static indexContainer joinConstructive(indexContainer i1, indexContainer i2, long time, int maxDistance) { + if ((i1 == null) || (i2 == null)) return null; + if ((i1.size() == 0) || (i2.size() == 0)) return new indexRowSetContainer(null); + + // decide which method to use + int high = ((i1.size() > i2.size()) ? i1.size() : i2.size()); + int low = ((i1.size() > i2.size()) ? i2.size() : i1.size()); + int stepsEnum = 10 * (high + low - 1); + int stepsTest = 12 * log2(high) * low; + + // start most efficient method + if (stepsEnum > stepsTest) { + if (i1.size() < i2.size()) + return joinConstructiveByTest(i1, i2, time, maxDistance); + else + return joinConstructiveByTest(i2, i1, time, maxDistance); + } else { + return joinConstructiveByEnumeration(i1, i2, time, maxDistance); + } + } + + private static indexContainer joinConstructiveByTest(indexContainer small, indexContainer large, long time, int maxDistance) { + System.out.println("DEBUG: JOIN METHOD BY TEST"); + indexContainer conj = new indexRowSetContainer(null); // start with empty search result + Iterator se = small.entries(); + indexEntry ie0, ie1; + long stamp = System.currentTimeMillis(); + while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { + ie0 = (indexEntry) se.next(); + ie1 = large.get(ie0.urlHash()); + if (ie1 != null) { + // this is a hit. Calculate word distance: + ie0.combineDistance(ie1); + if (ie0.worddistance() <= maxDistance) conj.add(ie0); + } + } + return conj; + } + + private static indexContainer joinConstructiveByEnumeration(indexContainer i1, indexContainer i2, long time, int maxDistance) { + System.out.println("DEBUG: JOIN METHOD BY ENUMERATION"); + indexContainer conj = new indexRowSetContainer(null); // start with empty search result + if (!((i1.getOrdering().signature().equals(i2.getOrdering().signature())) && + (i1.getOrderColumn() == i2.getOrderColumn()))) return conj; // ordering must be equal + Iterator e1 = i1.entries(); + Iterator e2 = i2.entries(); + int c; + if ((e1.hasNext()) && (e2.hasNext())) { + indexEntry ie1; + indexEntry ie2; + ie1 = (indexEntry) e1.next(); + ie2 = (indexEntry) e2.next(); - public Set urlHashes() { - // TODO Auto-generated method stub - return null; + long stamp = System.currentTimeMillis(); + while ((System.currentTimeMillis() - stamp) < time) { + c = i1.getOrdering().compare(ie1.urlHash(), ie2.urlHash()); + //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); + if (c < 0) { + if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break; + } else if (c > 0) { + if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break; + } else { + // we have found the same urls in different searches! 
+ ie1.combineDistance(ie2); + if (ie1.worddistance() <= maxDistance) conj.add(ie1); + if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break; + if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break; + } + } + } + return conj; + } + + public String toString() { + return "C[" + wordHash + "] has " + this.size() + " entries"; + } + + public int hashCode() { + return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); } } diff --git a/source/de/anomic/index/indexTreeMapContainer.java b/source/de/anomic/index/indexTreeMapContainer.java deleted file mode 100644 index f69c33fec..000000000 --- a/source/de/anomic/index/indexTreeMapContainer.java +++ /dev/null @@ -1,322 +0,0 @@ -// indexTreeMapContainer.java -// (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany -// first published 07.05.2005 on http://www.anomic.de -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -/* - an indexContainer is a bag of indexEntries for a single word - such an container represents a RWI snippet: - it collects a new RWI until it is so big that it should be flushed to either - - an indexAssortment: collection of indexContainers of same size or - - the backend storage - - the creationTime is necessary to organize caching of containers -*/ - -package de.anomic.index; - -import java.util.ConcurrentModificationException; -import java.util.Iterator; -import java.util.Set; -import java.util.TreeMap; - -import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroNaturalOrder; -import de.anomic.kelondro.kelondroOrder; - -public final class indexTreeMapContainer extends indexAbstractContainer implements indexContainer { - - private String wordHash; - private final TreeMap container; // urlHash/plasmaWordIndexEntry - Mapping - private long updateTime; - private kelondroOrder ordering; - private int order_column; - - public indexTreeMapContainer(String wordHash) { - this(wordHash, new kelondroNaturalOrder(true), 0); - } - - public indexTreeMapContainer(String wordHash, kelondroOrder ordering, int column) { - this.wordHash = wordHash; - this.updateTime = 0; - this.ordering = ordering; - this.order_column = column; - container = new TreeMap(ordering); // a urlhash/plasmaWordIndexEntry - relation - } - - public indexContainer topLevelClone() { - indexContainer newContainer = new indexTreeMapContainer(this.wordHash, this.ordering, this.order_column); - newContainer.add(this, -1); - return newContainer; - } - - public void setWordHash(String newWordHash) { - // this is used to replicate a container for different word indexes during global search - this.wordHash = 
newWordHash; - } - - public void clear() { - container.clear(); - } - - public int size() { - return container.size(); - } - - public long updated() { - return updateTime; - } - - public void setOrdering(kelondroOrder newOrder, int newColumn) { - this.ordering = newOrder; - this.order_column = newColumn; - } - - public kelondroOrder getOrdering() { - return this.ordering; - } - - public int getOrderColumn() { - return this.order_column; - } - - public String getWordHash() { - return wordHash; - } - - public int add(indexEntry entry) { - return add(entry, System.currentTimeMillis()); - } - - public int add(indexEntry entry, long updateTime) { - this.updateTime = java.lang.Math.max(this.updateTime, updateTime); - return (addi(entry)) ? 1 : 0; - } - - public int add(indexEntry[] entries, long updateTime) { - int c = 0; - for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++; - this.updateTime = java.lang.Math.max(this.updateTime, updateTime); - return c; - } - - public int add(indexContainer c, long maxTime) { - // returns the number of new elements - long startTime = System.currentTimeMillis(); - if (c == null) return 0; - int x = 0; - synchronized (c) { - Iterator i = c.entries(); - while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) { - try { - if (addi((indexEntry) i.next())) x++; - } catch (ConcurrentModificationException e) {} - } - } - this.updateTime = java.lang.Math.max(this.updateTime, c.updated()); - return x; - } - - private boolean addi(indexEntry entry) { - // returns true if the new entry was added, false if it already existed - indexEntry oldEntry = (indexEntry) container.put(entry.urlHash(), entry); - if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container - container.put(entry.urlHash(), oldEntry); // put it back - return false; - } - return (oldEntry == null); - } - - public boolean contains(String urlHash) { - return container.containsKey(urlHash); - } - - public indexEntry get(String urlHash) { - return (indexEntry) container.get(urlHash); - } - - public indexEntry[] getEntryArray() { - return (indexEntry[]) container.values().toArray(); - } - - public indexEntry remove(String urlHash) { - return (indexEntry) container.remove(urlHash); - } - - public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { - if (!wordHash.equals(this.wordHash)) return false; - return remove(urlHash) != null; - } - - public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) { - if (!wordHash.equals(this.wordHash)) return 0; - int count = 0; - Iterator i = urlHashes.iterator(); - while (i.hasNext()) count += (remove((String) i.next()) == null) ? 
0 : 1; - return count; - } - - public Iterator entries() { - // returns an iterator of indexEntry objects - return container.values().iterator(); - } - - public String toString() { - return "C[" + wordHash + "] has " + container.size() + " entries"; - } - - public int hashCode() { - return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); - } - - public static indexContainer joinContainer(Set containers, long time, int maxDistance) { - - long stamp = System.currentTimeMillis(); - - // order entities by their size - TreeMap map = new TreeMap(); - indexTreeMapContainer singleContainer; - Iterator i = containers.iterator(); - int count = 0; - while (i.hasNext()) { - // get next entity: - singleContainer = (indexTreeMapContainer) i.next(); - - // check result - if ((singleContainer == null) || (singleContainer.size() == 0)) return new indexTreeMapContainer(null); // as this is a cunjunction of searches, we have no result if any word is not known - - // store result in order of result size - map.put(new Long(singleContainer.size() * 1000 + count), singleContainer); - count++; - } - - // check if there is any result - if (map.size() == 0) return new indexTreeMapContainer(null); // no result, nothing found - - // the map now holds the search results in order of number of hits per word - // we now must pairwise build up a conjunction of these sets - Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries - indexContainer searchA, searchB, searchResult = (indexContainer) map.remove(k); - while ((map.size() > 0) && (searchResult.size() > 0)) { - // take the first element of map which is a result and combine it with result - k = (Long) map.firstKey(); // the next smallest... - time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); - searchA = searchResult; - searchB = (indexContainer) map.remove(k); - searchResult = indexTreeMapContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance); - // free resources - searchA = null; - searchB = null; - } - - // in 'searchResult' is now the combined search result - if (searchResult.size() == 0) return new indexTreeMapContainer(null); - return searchResult; - } - - // join methods - private static int log2(int x) { - int l = 0; - while (x > 0) {x = x >> 1; l++;} - return l; - } - - public static indexContainer joinConstructive(indexContainer i1, indexContainer i2, long time, int maxDistance) { - if ((i1 == null) || (i2 == null)) return null; - if ((i1.size() == 0) || (i2.size() == 0)) return new indexTreeMapContainer(null); - - // decide which method to use - int high = ((i1.size() > i2.size()) ? i1.size() : i2.size()); - int low = ((i1.size() > i2.size()) ? 
i2.size() : i1.size()); - int stepsEnum = 10 * (high + low - 1); - int stepsTest = 12 * log2(high) * low; - - // start most efficient method - if (stepsEnum > stepsTest) { - if (i1.size() < i2.size()) - return joinConstructiveByTest(i1, i2, time, maxDistance); - else - return joinConstructiveByTest(i2, i1, time, maxDistance); - } else { - return joinConstructiveByEnumeration(i1, i2, time, maxDistance); - } - } - - private static indexContainer joinConstructiveByTest(indexContainer small, indexContainer large, long time, int maxDistance) { - System.out.println("DEBUG: JOIN METHOD BY TEST"); - indexTreeMapContainer conj = new indexTreeMapContainer(null); // start with empty search result - Iterator se = small.entries(); - indexEntry ie0, ie1; - long stamp = System.currentTimeMillis(); - while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { - ie0 = (indexEntry) se.next(); - ie1 = large.get(ie0.urlHash()); - if (ie1 != null) { - // this is a hit. Calculate word distance: - ie0.combineDistance(ie1); - if (ie0.worddistance() <= maxDistance) conj.add(ie0); - } - } - return conj; - } - - private static indexContainer joinConstructiveByEnumeration(indexContainer i1, indexContainer i2, long time, int maxDistance) { - System.out.println("DEBUG: JOIN METHOD BY ENUMERATION"); - indexTreeMapContainer conj = new indexTreeMapContainer(null); // start with empty search result - if (!((i1.getOrdering().signature().equals(i2.getOrdering().signature())) && - (i1.getOrderColumn() == i2.getOrderColumn()))) return conj; // ordering must be equal - Iterator e1 = i1.entries(); - Iterator e2 = i2.entries(); - int c; - if ((e1.hasNext()) && (e2.hasNext())) { - indexEntry ie1; - indexEntry ie2; - ie1 = (indexEntry) e1.next(); - ie2 = (indexEntry) e2.next(); - - long stamp = System.currentTimeMillis(); - while ((System.currentTimeMillis() - stamp) < time) { - c = i1.getOrdering().compare(ie1.urlHash(), ie2.urlHash()); - //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); - if (c < 0) { - if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break; - } else if (c > 0) { - if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break; - } else { - // we have found the same urls in different searches! 
- ie1.combineDistance(ie2); - if (ie1.worddistance() <= maxDistance) conj.add(ie1); - if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break; - if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break; - } - } - } - return conj; - } - - public Set urlHashes() { - return container.keySet(); - } - -} diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index bdaddedaf..41adb9afc 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -49,7 +49,6 @@ import java.util.Iterator; import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; -import de.anomic.index.indexTreeMapContainer; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.server.serverCodings; @@ -75,22 +74,22 @@ public class plasmaDHTChunk { private int status = chunkStatus_UNDEFINED; private String startPointHash; - private indexTreeMapContainer[] indexContainers = null; + private indexContainer[] indexContainers = null; private HashMap urlCache; // String (url-hash) / plasmaCrawlLURL.Entry private int idxCount; private long selectionStartTime = 0; private long selectionEndTime = 0; - public indexTreeMapContainer firstContainer() { + public indexContainer firstContainer() { return indexContainers[0]; } - public indexTreeMapContainer lastContainer() { + public indexContainer lastContainer() { return indexContainers[indexContainers.length - 1]; } - public indexTreeMapContainer[] containers() { + public indexContainer[] containers() { return indexContainers; } @@ -200,7 +199,7 @@ public class plasmaDHTChunk { double maximumDistance = ((double) peerRedundancy * 2) / ((double) yacyCore.seedDB.sizeConnected()); while ((maxcount > refcount) && (indexContainerIterator.hasNext()) && ((container = (indexContainer) indexContainerIterator.next()) != null) && (container.size() > 0) - && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(container.getWordHash(), ((indexTreeMapContainer) tmpContainers.get(0)).getWordHash()) < maximumDistance))) { + && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(container.getWordHash(), ((indexContainer) tmpContainers.get(0)).getWordHash()) < maximumDistance))) { // make an on-the-fly entity and insert values int notBoundCounter = 0; try { @@ -243,7 +242,7 @@ public class plasmaDHTChunk { } } // create result - indexContainers = (indexTreeMapContainer[]) tmpContainers.toArray(new indexTreeMapContainer[tmpContainers.size()]); + indexContainers = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]); if ((indexContainers == null) || (indexContainers.length == 0)) { log.logFine("No index available for index transfer, hash start-point " + startPointHash); @@ -256,13 +255,13 @@ public class plasmaDHTChunk { return refcount; } catch (kelondroException e) { log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e); - indexContainers = new indexTreeMapContainer[0]; + indexContainers = new indexContainer[0]; urlCache = new HashMap(); this.status = chunkStatus_FAILED; return 0; } catch (IOException e) { log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e); - indexContainers = new indexTreeMapContainer[0]; + indexContainers = new indexContainer[0]; urlCache = new HashMap(); this.status = chunkStatus_FAILED; return 0; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 
168f4c1e6..77450ae72 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -53,7 +53,7 @@ import de.anomic.server.serverInstantThread; import de.anomic.yacy.yacySearch; import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; -import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexRowSetContainer; public final class plasmaSearchEvent extends Thread implements Runnable { @@ -86,8 +86,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.ranking = ranking; this.urlStore = urlStore; this.snippetCache = snippetCache; - this.rcLocal = new indexTreeMapContainer(null); - this.rcGlobal = new indexTreeMapContainer(null); + this.rcLocal = new indexRowSetContainer(null); + this.rcGlobal = new indexRowSetContainer(null); this.rcGlobalCount = 0; this.profileLocal = localTiming; this.profileGlobal = remoteTiming; @@ -178,13 +178,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // since this is a conjunction we return an empty entity if any word // is not known if (containers == null) { - rcLocal = new indexTreeMapContainer(null); + rcLocal = new indexRowSetContainer(null); return 0; } // join the result profileLocal.startTimer(); - rcLocal = indexTreeMapContainer.joinContainer(containers, + rcLocal = indexRowSetContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_JOIN), query.maxDistance); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_JOIN); @@ -220,7 +220,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime - indexTreeMapContainer searchResult = new indexTreeMapContainer(null); + indexContainer searchResult = new indexRowSetContainer(null); long preorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_PRESORT); profileLocal.startTimer(); diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 5c60aa696..a74bcdda7 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -49,8 +49,8 @@ import java.util.Iterator; import de.anomic.server.serverCodings; import de.anomic.server.serverFileUtils; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; -import de.anomic.index.indexTreeMapContainer; import de.anomic.kelondro.kelondroBinSearch; public final class plasmaSearchPreOrder { @@ -123,7 +123,7 @@ public final class plasmaSearchPreOrder { return (indexEntry) pageAcc.remove(top); } - public void addContainer(indexTreeMapContainer container, long maxTime) { + public void addContainer(indexContainer container, long maxTime) { long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; indexEntry iEntry; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 4b4f6f2c8..565fe6dfa 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -130,9 +130,10 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpc; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; -import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexRowSetContainer; import de.anomic.index.indexURL; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; @@ -1487,7 +1488,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String word = (String) wentry.getKey(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); String wordHash = indexEntryAttribute.word2hash(word); - indexTreeMapContainer wordIdxContainer = new indexTreeMapContainer(wordHash); + indexContainer wordIdxContainer = new indexRowSetContainer(wordHash); indexEntry wordIdxEntry = new indexURLEntry(urlHash, urlLength, urlComps, wordStat.count, @@ -1517,7 +1518,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // transfering the index to the storage peer HashMap resultObj = yacyClient.transferIndex( seed, - (indexTreeMapContainer[])tmpContainers.toArray(new indexTreeMapContainer[tmpContainers.size()]), + (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]), urlCache, true, 120000); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 9dcf3dcbd..fe63e6b77 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -66,7 +66,6 @@ import de.anomic.index.indexRAMCacheRI; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; import de.anomic.index.indexRowSetContainer; -import de.anomic.index.indexTreeMapContainer; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; @@ -371,7 +370,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public synchronized indexContainer deleteContainer(String wordHash) { indexContainer c = ramCache.deleteContainer(wordHash); - if (c == null) c = new indexTreeMapContainer(wordHash); + if (c == null) c = new indexRowSetContainer(wordHash); c.add(assortmentCluster.deleteContainer(wordHash, -1), -1); c.add(backend.deleteContainer(wordHash), -1); return c; diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 54417f13e..ea0887d55 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -59,7 +59,7 @@ import java.util.Iterator; import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; -import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexRowSetContainer; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroTree; @@ -246,7 +246,7 @@ public final class plasmaWordIndexAssortment { if (row == null) return null; 
String wordHash = row.getColString(0, null); final long updateTime = row.getColLongB256(2); - indexTreeMapContainer container = new indexTreeMapContainer(wordHash); + indexContainer container = new indexRowSetContainer(wordHash); int al = assortmentCapacity(row.objectsize()); for (int i = 0; i < al; i++) { container.add( diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index 6fa64915e..89fb8d4c1 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -58,7 +58,6 @@ import de.anomic.index.indexEntry; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; import de.anomic.index.indexRowSetContainer; -import de.anomic.index.indexTreeMapContainer; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroObjectCache; import de.anomic.kelondro.kelondroRecords; @@ -160,10 +159,10 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl int clusterStart = clusterCount - (int) (Math.random() * (clusterCount - clusterMinStart)); // do the insert - indexTreeMapContainer c; + indexContainer c; Iterator i = newContainer.entries(); for (int j = clusterStart; j >= 1; j--) { - c = new indexTreeMapContainer(newContainer.getWordHash()); + c = new indexRowSetContainer(newContainer.getWordHash()); for (int k = 0; k < j; k++) { if (i.hasNext()) { c.add((indexEntry) i.next(), newContainer.updated()); @@ -202,11 +201,11 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl } if (need == 0) { // we found spaces so that we can put in the newContainer into these spaces - indexTreeMapContainer c; + indexContainer c; Iterator i = newContainer.entries(); for (int j = testsize - 1; j >= 0; j--) { if (spaces[j] == 0) continue; - c = new indexTreeMapContainer(newContainer.getWordHash()); + c = new indexRowSetContainer(newContainer.getWordHash()); for (int k = 0; k <= j; k++) { assert (i.hasNext()); c.add((indexEntry) i.next(), newContainer.updated()); @@ -232,7 +231,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl public indexContainer deleteContainer(String wordHash, long maxTime) { // removes all records from all the assortments and return them - indexContainer buffer, record = new indexTreeMapContainer(wordHash); + indexContainer buffer, record = new indexRowSetContainer(wordHash); long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long remainingTime; for (int i = 0; i < clusterCount; i++) { @@ -257,7 +256,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl */ public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { - indexContainer buffer, record = new indexTreeMapContainer(wordHash); + indexContainer buffer, record = new indexRowSetContainer(wordHash); boolean found = false; for (int i = 0; i < clusterCount; i++) { buffer = assortments[i].remove(wordHash); @@ -273,7 +272,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl } public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) { - indexContainer buffer, record = new indexTreeMapContainer(wordHash); + indexContainer buffer, record = new indexRowSetContainer(wordHash); int initialSize = urlHashes.size(); for (int i = 0; i < clusterCount; i++) { buffer = assortments[i].remove(wordHash); @@ -298,7 +297,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { // collect all records from all the assortments and return them - indexContainer buffer, record = new indexTreeMapContainer(wordHash); + indexContainer buffer, record = new indexRowSetContainer(wordHash); long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long remainingTime; for (int i = 0; i < clusterCount; i++) { diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java index 5a26ecd67..dc2eadc00 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java @@ -54,7 +54,7 @@ import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; -import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexRowSetContainer; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; @@ -230,7 +230,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute if (plasmaWordIndexFile.wordHash2path(databaseRoot, wordHash).exists()) { plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? 
-1 : maxTime * 9 / 10); - indexTreeMapContainer container = new indexTreeMapContainer(wordHash); + indexContainer container = new indexRowSetContainer(wordHash); indexEntry entry; Iterator i = entity.elements(true); while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) { @@ -239,7 +239,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index } return container; } else { - return new indexTreeMapContainer(wordHash); + return new indexRowSetContainer(wordHash); } } @@ -254,7 +254,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index public indexContainer deleteContainer(String wordHash) { plasmaWordIndexFile.removePlasmaIndex(databaseRoot, wordHash); - return new indexTreeMapContainer(wordHash); + return new indexRowSetContainer(wordHash); } public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index a1ae3b8c9..8d0864f03 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -56,7 +56,7 @@ import de.anomic.http.httpc; import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; -import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexRowSetContainer; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaCrawlLURL; @@ -468,9 +468,9 @@ public final class yacyClient { // create containers final int words = wordhashes.length() / indexEntryAttribute.wordHashLength; - indexTreeMapContainer[] container = new indexTreeMapContainer[words]; + indexContainer[] container = new indexContainer[words]; for (int i = 0; i < words; i++) { - container[i] = new indexTreeMapContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); + container[i] = new indexRowSetContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); } // insert results to containers
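The call-site changes above all follow one pattern: declare the variable against the indexContainer interface and construct the kelondroRowSet-backed indexRowSetContainer where the removed indexTreeMapContainer used to be instantiated. A minimal sketch of that pattern, using only methods visible in this patch; the class name, helper name, and printout are illustrative only, not part of the patch:

    import java.util.Iterator;

    import de.anomic.index.indexContainer;
    import de.anomic.index.indexEntry;
    import de.anomic.index.indexRowSetContainer;

    public class ContainerMigrationSketch {

        // hypothetical helper: build and read a container the way the patched call sites do
        public static indexContainer collect(String wordHash, indexEntry[] newEntries) {
            // before: indexTreeMapContainer c = new indexTreeMapContainer(wordHash);
            indexContainer c = new indexRowSetContainer(wordHash);  // kelondroRowSet-backed implementation
            c.add(newEntries, System.currentTimeMillis());          // same indexContainer interface as before
            Iterator i = c.entries();                               // iterates indexEntry objects
            while (i.hasNext()) {
                indexEntry e = (indexEntry) i.next();
                System.out.println(c.getWordHash() + " -> " + e.urlHash());
            }
            return c;
        }
    }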
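The reworked addi() in indexRowSetContainer keeps the more recent of two entries with the same URL hash and now checks whether put() returned a previous row before wrapping it (the old code passed a possibly null row straight to the indexURLEntry constructor). A self-contained sketch of that contract, assuming a plain java.util.HashMap and a made-up Entry class instead of the kelondroRowSet and indexURLEntry used in the patch:

    import java.util.HashMap;
    import java.util.Map;

    public class KeepNewestMap {

        // made-up stand-in for indexEntry: only the parts needed to show the contract
        public static class Entry {
            final String urlHash;
            final long lastModified;
            Entry(String urlHash, long lastModified) {
                this.urlHash = urlHash;
                this.lastModified = lastModified;
            }
            boolean isOlder(Entry other) {
                return this.lastModified < other.lastModified;
            }
        }

        private final Map map = new HashMap(); // urlHash -> Entry

        // returns true if the new entry was added, false if a more recent entry was already present
        public boolean addi(Entry entry) {
            Entry oldEntry = (Entry) map.put(entry.urlHash, entry);
            if (oldEntry == null) return true;        // nothing was there before: added
            if (entry.isOlder(oldEntry)) {            // a more recent entry is already in this container
                map.put(oldEntry.urlHash, oldEntry);  // put it back
                return false;
            }
            return true;                              // the older entry was replaced
        }

        public static void main(String[] args) {
            KeepNewestMap m = new KeepNewestMap();
            System.out.println(m.addi(new Entry("AAAA", 100L))); // true: first insert
            System.out.println(m.addi(new Entry("AAAA",  50L))); // false: newer entry kept
            System.out.println(m.addi(new Entry("AAAA", 200L))); // true: older entry replaced
        }
    }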
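joinConstructive() chooses between the two join strategies by comparing estimated step counts: roughly 10 * (high + low - 1) for an ordered pairwise enumeration of both containers versus 12 * log2(high) * low for testing each entry of the small container against the large one. A standalone sketch of that cost model, with the same constants as the patch but hypothetical container sizes:

    public class JoinMethodChoice {

        // bit length, as in indexRowSetContainer.log2()
        private static int log2(int x) {
            int l = 0;
            while (x > 0) { x = x >> 1; l++; }
            return l;
        }

        // estimated cost of joinConstructiveByEnumeration: one ordered pass over both containers
        private static long stepsEnum(int high, int low) {
            return 10L * (high + low - 1);
        }

        // estimated cost of joinConstructiveByTest: one lookup in the large container per small entry
        private static long stepsTest(int high, int low) {
            return 12L * log2(high) * low;
        }

        public static void main(String[] args) {
            int size1 = 50, size2 = 200000; // hypothetical container sizes
            int high = Math.max(size1, size2);
            int low = Math.min(size1, size2);
            if (stepsEnum(high, low) > stepsTest(high, low)) {
                System.out.println("join by test: look up each entry of the small container in the large one");
            } else {
                System.out.println("join by enumeration: merge both containers in url-hash order");
            }
        }
    }

With sizes this unequal the lookup join wins; for containers of similar size the ordered enumeration needs fewer steps, which is the branch taken in joinConstructive().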