From 7b3b12888ce93a9f55ce33892df35cde12fe8f07 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 28 May 2006 01:09:31 +0000 Subject: [PATCH] refactoring: integrated indexContainer abstraction layer git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2149 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 10 +-- .../anomic/index/indexAbstractContainer.java | 64 ++++++++++++++++ .../de/anomic/index/indexAbstractEntry.java | 1 - source/de/anomic/index/indexAbstractRI.java | 4 +- source/de/anomic/index/indexContainer.java | 75 +++++++++++++++++++ source/de/anomic/index/indexEntry.java | 1 + source/de/anomic/index/indexRI.java | 10 +-- .../plasma/dbImport/plasmaDbImporter.java | 4 +- .../plasmaWordIndexAssortmentImporter.java | 4 +- source/de/anomic/plasma/plasmaDHTChunk.java | 3 +- .../de/anomic/plasma/plasmaSearchEvent.java | 3 +- source/de/anomic/plasma/plasmaWordIndex.java | 28 +++---- .../plasma/plasmaWordIndexAssortment.java | 9 ++- .../plasmaWordIndexAssortmentCluster.java | 23 +++--- .../anomic/plasma/plasmaWordIndexCache.java | 9 ++- .../plasma/plasmaWordIndexClassicDB.java | 7 +- .../anomic/plasma/plasmaWordIndexEntity.java | 4 +- .../plasma/plasmaWordIndexEntryContainer.java | 38 ++++++---- source/de/anomic/yacy/yacyClient.java | 7 +- source/de/anomic/yacy/yacySearch.java | 8 +- source/yacy.java | 8 +- 21 files changed, 237 insertions(+), 83 deletions(-) create mode 100644 source/de/anomic/index/indexAbstractContainer.java create mode 100644 source/de/anomic/index/indexContainer.java diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 270a1d17f..69b926465 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -57,13 +57,13 @@ import java.util.TreeMap; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; import 
de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndexEntryInstance; -import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyClient; @@ -147,7 +147,7 @@ public class IndexControl_p { if (post.containsKey("keyhashdeleteall")) { if (delurl || delurlref) { // generate an urlx array - plasmaWordIndexEntryContainer index = null; + indexContainer index = null; index = switchboard.wordIndex.getContainer(keyhash, true, -1); Iterator en = index.entries(); int i = 0; @@ -246,7 +246,7 @@ public class IndexControl_p { } prop.put("urlstring", ""); prop.put("urlhash", ""); - plasmaWordIndexEntryContainer index; + indexContainer index; String result; long starttime = System.currentTimeMillis(); index = switchboard.wordIndex.getContainer(keyhash, true, -1); @@ -275,7 +275,7 @@ public class IndexControl_p { int timeout = (int) switchboard.getConfigLong("indexControl.timeout",60000); result = yacyClient.transferIndex( yacyCore.seedDB.getConnected(post.get("hostHash", "")), - new plasmaWordIndexEntryContainer[]{index}, + new indexContainer[]{index}, knownURLs, "true".equalsIgnoreCase(gzipBody), timeout); @@ -421,7 +421,7 @@ public class IndexControl_p { public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, String keystring) { // search for a word hash and generate a list of url links - plasmaWordIndexEntryContainer index = null; + indexContainer index = null; try { index = switchboard.wordIndex.getContainer(keyhash, true, -1); diff --git a/source/de/anomic/index/indexAbstractContainer.java b/source/de/anomic/index/indexAbstractContainer.java new file mode 100644 index 000000000..3a74293bc --- /dev/null +++ b/source/de/anomic/index/indexAbstractContainer.java @@ -0,0 +1,64 @@ +// indexAbstractContainer.java +// (C) 2006 by Michael Peter Christen; 
mc@anomic.de, Frankfurt a. M., Germany +// first published 20.05.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.index; + +import de.anomic.kelondro.kelondroBase64Order; + +public abstract class indexAbstractContainer implements indexContainer { + + private String wordHash; + private long updateTime; + + public void setWordHash(String newWordHash) { + // this is used to replicate a container for different word indexes during global search + this.wordHash = newWordHash; + } + + public long updated() { + return updateTime; + } + + public String wordHash() { + return wordHash; + } + + public int add(indexEntry entry) { + return add(entry, System.currentTimeMillis()); + } + + public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { + if (!wordHash.equals(this.wordHash)) return 0; + int count = 0; + for (int i = 0; i < urlHashes.length; i++) count += (remove(urlHashes[i]) == null) ? 
0 : 1; + return count; + } + + public int hashCode() { + return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); + } + +} diff --git a/source/de/anomic/index/indexAbstractEntry.java b/source/de/anomic/index/indexAbstractEntry.java index 134faa2c4..53218ddca 100644 --- a/source/de/anomic/index/indexAbstractEntry.java +++ b/source/de/anomic/index/indexAbstractEntry.java @@ -26,7 +26,6 @@ package de.anomic.index; -//import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaWordIndex; public abstract class indexAbstractEntry implements indexEntry { diff --git a/source/de/anomic/index/indexAbstractRI.java b/source/de/anomic/index/indexAbstractRI.java index d58f5a42e..2ad05b311 100644 --- a/source/de/anomic/index/indexAbstractRI.java +++ b/source/de/anomic/index/indexAbstractRI.java @@ -30,14 +30,14 @@ import de.anomic.plasma.plasmaWordIndexEntryContainer; public abstract class indexAbstractRI implements indexRI { - public plasmaWordIndexEntryContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { + public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); container.add(newEntry); return addEntries(container, updateTime, dhtCase); } public long getUpdateTime(String wordHash) { - plasmaWordIndexEntryContainer entries = getContainer(wordHash, false, -1); + indexContainer entries = getContainer(wordHash, false, -1); if (entries == null) return 0; return entries.updated(); } diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java new file mode 100644 index 000000000..797444761 --- /dev/null +++ b/source/de/anomic/index/indexContainer.java @@ -0,0 +1,75 @@ +// indexContainer.java +// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. 
M., Germany +// first published 20.05.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +// an index container is a set of index entries + +package de.anomic.index; + +import java.util.Iterator; + +import de.anomic.kelondro.kelondroOrder; + +public interface indexContainer { + + public void setWordHash(String newWordHash); + + public void clear(); + + public int size(); + + public long updated(); + + public kelondroOrder order(); + + public String wordHash(); + + public int add(indexEntry entry); + + public int add(indexEntry entry, long updateTime); + + public int add(indexEntry[] entries, long updateTime); + + public int add(indexContainer c, long maxTime); + + public boolean contains(String urlHash) ; + + public indexEntry get(String urlHash); + + public indexEntry[] getEntryArray() ; + + public indexEntry remove(String urlHash); + + public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete); + + public Iterator entries(); + + public String toString(); + + public int hashCode(); + + //public void 
joinConstructive(indexContainer c, long time, int maxDistance); + +} diff --git a/source/de/anomic/index/indexEntry.java b/source/de/anomic/index/indexEntry.java index 50785b5ca..f00cb7600 100644 --- a/source/de/anomic/index/indexEntry.java +++ b/source/de/anomic/index/indexEntry.java @@ -35,6 +35,7 @@ public interface indexEntry { public String getUrlHash(); public void combineDistance(indexEntry oe); + public int worddistance(); public void min(indexEntry other); public void max(indexEntry other); public void normalize(indexEntry min, indexEntry max); diff --git a/source/de/anomic/index/indexRI.java b/source/de/anomic/index/indexRI.java index ebc661f0c..bcbeee508 100644 --- a/source/de/anomic/index/indexRI.java +++ b/source/de/anomic/index/indexRI.java @@ -44,8 +44,6 @@ package de.anomic.index; import java.util.Iterator; -import de.anomic.plasma.plasmaWordIndexEntryContainer; - public interface indexRI { public int size(); @@ -53,12 +51,12 @@ public interface indexRI { public Iterator wordHashes(String startWordHash, boolean rot); public long getUpdateTime(String wordHash); - public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime); - public plasmaWordIndexEntryContainer deleteContainer(String wordHash); + public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime); + public indexContainer deleteContainer(String wordHash); public int removeEntries(String wordHash, String[] referenceHashes, boolean deleteComplete); - public plasmaWordIndexEntryContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtCase); - public plasmaWordIndexEntryContainer addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime, boolean dhtCase); + public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtCase); + public indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase); public void close(int 
waitingSeconds); diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 4b9d837cf..14c6650ab 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -6,12 +6,12 @@ import java.util.HashSet; import java.util.Iterator; import java.util.TreeSet; +import de.anomic.index.indexContainer; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndexEntryInstance; -import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverDate; public class plasmaDbImporter extends AbstractImporter implements dbImporter { @@ -119,7 +119,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { while (!isAborted() && importWordHashIterator.hasNext()) { TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true)); - plasmaWordIndexEntryContainer newContainer = null; + indexContainer newContainer = null; try { this.wordCounter++; this.wordHash = (String) importWordHashIterator.next(); diff --git a/source/de/anomic/plasma/dbImport/plasmaWordIndexAssortmentImporter.java b/source/de/anomic/plasma/dbImport/plasmaWordIndexAssortmentImporter.java index 61804a32f..88887bf0c 100644 --- a/source/de/anomic/plasma/dbImport/plasmaWordIndexAssortmentImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaWordIndexAssortmentImporter.java @@ -3,9 +3,9 @@ package de.anomic.plasma.dbImport; import java.io.File; import java.util.Iterator; +import de.anomic.index.indexContainer; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexAssortment; -import de.anomic.plasma.plasmaWordIndexEntryContainer; public class plasmaWordIndexAssortmentImporter extends AbstractImporter implements dbImporter{ @@ -99,7 +99,7 @@ public class 
plasmaWordIndexAssortmentImporter extends AbstractImporter implemen String hash = new String(row[0]); // creating an word entry container - plasmaWordIndexEntryContainer container; + indexContainer container; try { container = this.assortmentFile.row2container(hash, row); } catch (NullPointerException e) { diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index d1c4a3045..a6a1c6c03 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -46,6 +46,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import de.anomic.index.indexContainer; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.server.serverCodings; @@ -186,7 +187,7 @@ public class plasmaDHTChunk { String nexthash = ""; try { Iterator wordHashIterator = wordIndex.wordHashSet(hash, resourceLevel, true, maxcount).iterator(); - plasmaWordIndexEntryContainer indexContainer; + indexContainer indexContainer; Iterator urlIter; plasmaWordIndexEntryInstance indexEntry; plasmaCrawlLURL.Entry lurl; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index b4a38b975..6d02e6f18 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -51,6 +51,7 @@ import de.anomic.kelondro.kelondroException; import de.anomic.server.logging.serverLog; import de.anomic.server.serverInstantThread; import de.anomic.yacy.yacySearch; +import de.anomic.index.indexContainer; public final class plasmaSearchEvent extends Thread implements Runnable { @@ -64,7 +65,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { private plasmaWordIndex wordIndex; private plasmaCrawlLURL urlStore; private plasmaSnippetCache snippetCache; - private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results + 
private indexContainer rcLocal, rcGlobal; // caches for results private int rcGlobalCount; private plasmaSearchTimingProfile profileLocal, profileGlobal; private yacySearch[] searchThreads; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 643dd456a..252e4be65 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -58,6 +58,7 @@ import java.util.TreeSet; import java.net.URL; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexRI; @@ -158,8 +159,8 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { } } - public plasmaWordIndexEntryContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtCase) { - plasmaWordIndexEntryContainer c; + public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtCase) { + indexContainer c; if ((c = ramCache.addEntry(wordHash, entry, updateTime, dhtCase)) == null) { if (!dhtCase) flushControl(); return null; @@ -167,8 +168,8 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { return c; } - public plasmaWordIndexEntryContainer addEntries(plasmaWordIndexEntryContainer entries, long updateTime, boolean dhtCase) { - plasmaWordIndexEntryContainer added = ramCache.addEntries(entries, updateTime, dhtCase); + public indexContainer addEntries(indexContainer entries, long updateTime, boolean dhtCase) { + indexContainer added = ramCache.addEntries(entries, updateTime, dhtCase); // force flush if (!dhtCase) flushControl(); @@ -192,9 +193,9 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { } private synchronized void flushCache(String wordHash) { - plasmaWordIndexEntryContainer c = ramCache.deleteContainer(wordHash); + indexContainer c = 
ramCache.deleteContainer(wordHash); if (c != null) { - plasmaWordIndexEntryContainer feedback = assortmentCluster.addEntries(c, c.updated(), false); + indexContainer feedback = assortmentCluster.addEntries(c, c.updated(), false); if (feedback != null) { backend.addEntries(feedback, System.currentTimeMillis(), true); } @@ -277,7 +278,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { return condenser.RESULT_SIMI_WORDS; } - public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { + public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { long start = System.currentTimeMillis(); plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); @@ -307,7 +308,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { // retrieve entities that belong to the hashes HashSet containers = new HashSet(); String singleHash; - plasmaWordIndexEntryContainer singleContainer; + indexContainer singleContainer; Iterator i = wordHashes.iterator(); long start = System.currentTimeMillis(); long remaining; @@ -356,8 +357,9 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { backend.close(10); } - public synchronized plasmaWordIndexEntryContainer deleteContainer(String wordHash) { - plasmaWordIndexEntryContainer c = ramCache.deleteContainer(wordHash); + public synchronized indexContainer deleteContainer(String wordHash) { + indexContainer c = ramCache.deleteContainer(wordHash); + if (c == null) c = new plasmaWordIndexEntryContainer(wordHash); c.add(assortmentCluster.deleteContainer(wordHash, -1), -1); c.add(backend.deleteContainer(wordHash), -1); return c; @@ -369,7 +371,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { synchronized (this) { removed = ramCache.removeEntries(wordHash, urlHashes, deleteComplete); if (removed == urlHashes.length) return removed; - 
plasmaWordIndexEntryContainer container = assortmentCluster.deleteContainer(wordHash, -1); + indexContainer container = assortmentCluster.deleteContainer(wordHash, -1); if (container != null) { removed += container.removeEntries(wordHash, urlHashes, deleteComplete); if (container.size() != 0) { @@ -506,7 +508,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { } else { // take out all words from the assortment to see if it fits // together with the extracted assortment - plasmaWordIndexEntryContainer container = assortmentCluster.deleteContainer(wordhash, -1); + indexContainer container = assortmentCluster.deleteContainer(wordhash, -1); if (size + container.size() > assortmentCluster.clusterCapacity) { // this will also be too big to integrate, add to entity entity.addEntries(container); @@ -567,7 +569,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public void run() { serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); String wordHash = ""; - plasmaWordIndexEntryContainer wordContainer = null; + indexContainer wordContainer = null; plasmaWordIndexEntryInstance entry = null; URL url = null; HashSet urlHashs = new HashSet(); diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index aaf8fa013..628334e5f 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -56,6 +56,7 @@ import java.io.File; import java.io.IOException; import java.util.Iterator; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroRecords; @@ -125,7 +126,7 @@ public final class plasmaWordIndexAssortment { if (log != null) log.logConfig("Created new Assortment Database, width " + assortmentLength + ", " + bufferkb + "kb buffer"); } - public void 
store(plasmaWordIndexEntryContainer newContainer) { + public void store(indexContainer newContainer) { // stores a word index to assortment database // this throws an exception if the word hash already existed //log.logDebug("storeAssortment: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime); @@ -154,7 +155,7 @@ public final class plasmaWordIndexAssortment { if (oldrow != null) throw new RuntimeException("Store to assortment ambiguous"); } - public plasmaWordIndexEntryContainer remove(String wordHash) { + public indexContainer remove(String wordHash) { // deletes a word index from assortment database // and returns the content record byte[][] row = null; @@ -191,7 +192,7 @@ public final class plasmaWordIndexAssortment { } } - public plasmaWordIndexEntryContainer get(String wordHash) { + public indexContainer get(String wordHash) { // gets a word index from assortment database // and returns the content record byte[][] row = null; @@ -211,7 +212,7 @@ public final class plasmaWordIndexAssortment { return row2container(wordHash, row); } - public plasmaWordIndexEntryContainer row2container(String wordHash, byte[][] row) { + public indexContainer row2container(String wordHash, byte[][] row) { if (row == null) return null; final long updateTime = kelondroRecords.bytes2long(row[2]); plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index 3e91621c7..12cc15777 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -51,6 +51,7 @@ import java.io.IOException; import java.util.HashSet; import java.util.Iterator; +import de.anomic.index.indexContainer; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; import de.anomic.kelondro.kelondroNaturalOrder; @@ -97,13 
+98,13 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl } } - private plasmaWordIndexEntryContainer storeSingular(plasmaWordIndexEntryContainer newContainer) { + private indexContainer storeSingular(indexContainer newContainer) { // this tries to store the record. If the record does not fit, or a same hash already // exists and would not fit together with the new record, then the record is deleted from // the assortmen(s) and returned together with the newRecord. // if storage was successful, NULL is returned. if (newContainer.size() > clusterCount) return newContainer; // it will not fit - plasmaWordIndexEntryContainer buffer; + indexContainer buffer; while ((buffer = assortments[newContainer.size() - 1].remove(newContainer.wordHash())) != null) { if (newContainer.add(buffer, -1) == 0) return newContainer; // security check; othervise this loop does not terminate if (newContainer.size() > clusterCount) return newContainer; // it will not fit @@ -114,14 +115,14 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl return null; } - private void storeForced(plasmaWordIndexEntryContainer newContainer) { + private void storeForced(indexContainer newContainer) { // this stores the record and overwrites an existing record. // this is safe if we can be shure that the record does not exist before. 
if ((newContainer == null) || (newContainer.size() == 0) || (newContainer.size() > clusterCount)) return; // it will not fit assortments[newContainer.size() - 1].store(newContainer); } - private void storeStretched(plasmaWordIndexEntryContainer newContainer) { + private void storeStretched(indexContainer newContainer) { // this stores the record and stretches the storage over // all the assortments that are necessary to fit in the record // IMPORTANT: it must be ensured that the wordHash does not exist in the cluster before @@ -159,7 +160,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl } } - public plasmaWordIndexEntryContainer addEntries(plasmaWordIndexEntryContainer newContainer, long creationTime, boolean dhtCase) { + public indexContainer addEntries(indexContainer newContainer, long creationTime, boolean dhtCase) { // this is called by the index ram cache flush process // it returnes NULL if the storage was successful // it returnes a new container if the given container cannot be stored @@ -209,13 +210,13 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl return null; } - public plasmaWordIndexEntryContainer deleteContainer(String wordHash) { + public indexContainer deleteContainer(String wordHash) { return deleteContainer(wordHash, -1); } - public plasmaWordIndexEntryContainer deleteContainer(String wordHash, long maxTime) { + public indexContainer deleteContainer(String wordHash, long maxTime) { // removes all records from all the assortments and return them - plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); + indexContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long remainingTime; for (int i = 0; i < clusterCount; i++) { @@ -228,7 +229,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl } public int removeEntries(String wordHash, String[] referenceHashes, boolean deleteComplete) { - plasmaWordIndexEntryContainer c = deleteContainer(wordHash, -1); + indexContainer c = deleteContainer(wordHash, -1); int b = c.size(); c.removeEntries(wordHash, referenceHashes, false); if (c.size() != 0) { @@ -237,9 +238,9 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl return b - c.size(); } - public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { + public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { // collect all records from all the assortments and return them - plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); + indexContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long remainingTime; for (int i = 0; i < clusterCount; i++) { diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 8a1284446..2390c2e3e 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -50,6 +50,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; @@ -360,11 +361,11 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index return (((long) intTime) * (long) 1000) + startTime; } - public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) { + public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) { return (plasmaWordIndexEntryContainer) wCache.get(wordHash); } - public plasmaWordIndexEntryContainer deleteContainer(String wordHash) { + public indexContainer deleteContainer(String wordHash) { // returns the index that had been deleted synchronized (wCache) { plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.remove(wordHash); @@ -416,7 +417,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index return delCount; } - public plasmaWordIndexEntryContainer addEntries(plasmaWordIndexEntryContainer container, long updateTime, boolean dhtCase) { + public indexContainer addEntries(indexContainer container, long updateTime, boolean dhtCase) { // this puts the entries into the cache, not into the assortment directly int added = 0; @@ -443,7 +444,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index return null; } - public plasmaWordIndexEntryContainer addEntry(String wordHash, indexEntry newEntry, long 
updateTime, boolean dhtCase) { + public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { if (dhtCase) synchronized (kCache) { // put container into kCache plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java index 9b89b2b33..233a0412d 100644 --- a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java +++ b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java @@ -49,6 +49,7 @@ import java.util.Comparator; import java.util.Iterator; import java.util.TreeSet; +import de.anomic.index.indexContainer; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; import de.anomic.kelondro.kelondroNaturalOrder; @@ -187,7 +188,7 @@ public class plasmaWordIndexClassicDB extends indexAbstractRI implements indexRI } } - public synchronized plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { + public synchronized indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { long start = System.currentTimeMillis(); if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) { @@ -214,7 +215,7 @@ public class plasmaWordIndexClassicDB extends indexAbstractRI implements indexRI if (f.exists()) return f.lastModified(); else return -1; } - public plasmaWordIndexEntryContainer deleteContainer(String wordHash) { + public indexContainer deleteContainer(String wordHash) { plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash); return new plasmaWordIndexEntryContainer(wordHash); } @@ -240,7 +241,7 @@ public class plasmaWordIndexClassicDB extends indexAbstractRI implements indexRI } } - public plasmaWordIndexEntryContainer addEntries(plasmaWordIndexEntryContainer container, long creationTime, boolean 
highPriority) { + public indexContainer addEntries(indexContainer container, long creationTime, boolean highPriority) { //System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug // fetch the index cache if ((container == null) || (container.size() == 0)) return null; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index 4c2f8968d..779b8a9d1 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -48,6 +48,7 @@ import java.io.File; import java.io.IOException; import java.util.Iterator; +import de.anomic.index.indexContainer; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroException; @@ -157,7 +158,7 @@ public final class plasmaWordIndexEntity { return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedStringForm().getBytes()) == null); } - public int addEntries(plasmaWordIndexEntryContainer container) throws IOException { + public int addEntries(indexContainer container) throws IOException { //System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug // fetch the index cache if ((container == null) || (container.size() == 0)) return 0; @@ -178,6 +179,7 @@ public final class plasmaWordIndexEntity { } public boolean deleteComplete() { + if (theIndex == null) return false; try { theIndex.close(); } catch (IOException e) {} // remove file boolean success = theLocation.delete(); diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java index 40a60a17f..0fc3f3bf3 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java @@ -57,12 +57,14 @@ import java.util.Iterator; import java.util.Set; import 
java.util.TreeMap; +import de.anomic.index.indexContainer; +import de.anomic.index.indexAbstractContainer; import de.anomic.index.indexEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroOrder; -public final class plasmaWordIndexEntryContainer { +public final class plasmaWordIndexEntryContainer extends indexAbstractContainer implements indexContainer { private String wordHash; private final TreeMap container; // urlHash/plasmaWordIndexEntry - Mapping @@ -97,6 +99,10 @@ public final class plasmaWordIndexEntryContainer { return updateTime; } + public kelondroOrder order() { + return ordering; + } + public String wordHash() { return wordHash; } @@ -117,7 +123,7 @@ public final class plasmaWordIndexEntryContainer { return c; } - public int add(plasmaWordIndexEntryContainer c, long maxTime) { + public int add(indexContainer c, long maxTime) { // returns the number of new elements long startTime = System.currentTimeMillis(); if (c == null) return 0; @@ -128,7 +134,7 @@ public final class plasmaWordIndexEntryContainer { if (addi((plasmaWordIndexEntryInstance) i.next())) x++; } catch (ConcurrentModificationException e) {} } - this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime); + this.updateTime = java.lang.Math.max(this.updateTime, c.updated()); return x; } @@ -146,15 +152,15 @@ public final class plasmaWordIndexEntryContainer { return container.containsKey(urlHash); } - public plasmaWordIndexEntryInstance get(String urlHash) { + public indexEntry get(String urlHash) { return (plasmaWordIndexEntryInstance) container.get(urlHash); } - public plasmaWordIndexEntryInstance[] getEntryArray() { + public indexEntry[] getEntryArray() { return (plasmaWordIndexEntryInstance[]) container.values().toArray(); } - public plasmaWordIndexEntryInstance remove(String urlHash) { + public indexEntry remove(String urlHash) { return (plasmaWordIndexEntryInstance) container.remove(urlHash); } @@ 
-178,7 +184,7 @@ public final class plasmaWordIndexEntryContainer { return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); } - public static plasmaWordIndexEntryContainer joinContainer(Set containers, long time, int maxDistance) { + public static indexContainer joinContainer(Set containers, long time, int maxDistance) { long stamp = System.currentTimeMillis(); @@ -205,13 +211,13 @@ public final class plasmaWordIndexEntryContainer { // the map now holds the search results in order of number of hits per word // we now must pairwise build up a conjunction of these sets Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries - plasmaWordIndexEntryContainer searchA, searchB, searchResult = (plasmaWordIndexEntryContainer) map.remove(k); + indexContainer searchA, searchB, searchResult = (indexContainer) map.remove(k); while ((map.size() > 0) && (searchResult.size() > 0)) { // take the first element of map which is a result and combine it with result k = (Long) map.firstKey(); // the next smallest... 
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); searchA = searchResult; - searchB = (plasmaWordIndexEntryContainer) map.remove(k); + searchB = (indexContainer) map.remove(k); searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance); // free resources searchA = null; @@ -230,7 +236,7 @@ public final class plasmaWordIndexEntryContainer { return l; } - public static plasmaWordIndexEntryContainer joinConstructive(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) { + public static indexContainer joinConstructive(indexContainer i1, indexContainer i2, long time, int maxDistance) { if ((i1 == null) || (i2 == null)) return null; if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null); @@ -251,14 +257,14 @@ public final class plasmaWordIndexEntryContainer { } } - private static plasmaWordIndexEntryContainer joinConstructiveByTest(plasmaWordIndexEntryContainer small, plasmaWordIndexEntryContainer large, long time, int maxDistance) { + private static indexContainer joinConstructiveByTest(indexContainer small, indexContainer large, long time, int maxDistance) { System.out.println("DEBUG: JOIN METHOD BY TEST"); plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result Iterator se = small.entries(); - plasmaWordIndexEntryInstance ie0, ie1; + indexEntry ie0, ie1; long stamp = System.currentTimeMillis(); while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { - ie0 = (plasmaWordIndexEntryInstance) se.next(); + ie0 = (indexEntry) se.next(); ie1 = large.get(ie0.getUrlHash()); if (ie1 != null) { // this is a hit. 
Calculate word distance: @@ -269,10 +275,10 @@ public final class plasmaWordIndexEntryContainer { return conj; } - private static plasmaWordIndexEntryContainer joinConstructiveByEnumeration(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) { + private static indexContainer joinConstructiveByEnumeration(indexContainer i1, indexContainer i2, long time, int maxDistance) { System.out.println("DEBUG: JOIN METHOD BY ENUMERATION"); plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result - if (!(i1.ordering.signature().equals(i2.ordering.signature()))) return conj; // ordering must be equal + if (!(i1.order().signature().equals(i2.order().signature()))) return conj; // ordering must be equal Iterator e1 = i1.entries(); Iterator e2 = i2.entries(); int c; @@ -284,7 +290,7 @@ public final class plasmaWordIndexEntryContainer { long stamp = System.currentTimeMillis(); while ((System.currentTimeMillis() - stamp) < time) { - c = i1.ordering.compare(ie1.getUrlHash(), ie2.getUrlHash()); + c = i1.order().compare(ie1.getUrlHash(), ie2.getUrlHash()); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); if (c < 0) { if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index fc856c3c2..19e61dbd4 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -53,6 +53,7 @@ import java.util.Iterator; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaCrawlLURL; @@ -363,7 +364,7 @@ public final class yacyClient { boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, - plasmaWordIndexEntryContainer 
containerCache, + indexContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippets, plasmaSearchTimingProfile timingProfile, @@ -877,7 +878,7 @@ public final class yacyClient { } } - public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) { + public static String transferIndex(yacySeed targetSeed, indexContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) { // check if we got all necessary urls in the urlCache (only for debugging) Iterator eenum; @@ -935,7 +936,7 @@ public final class yacyClient { return null; } - private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, boolean gzipBody, int timeout) { + private static HashMap transferRWI(yacySeed targetSeed, indexContainer[] indexes, boolean gzipBody, int timeout) { final String address = targetSeed.getAddress(); if (address == null) { return null; } diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 953b14618..b92f43052 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -48,13 +48,13 @@ import java.util.Iterator; import java.util.Set; import java.util.HashMap; +import de.anomic.index.indexContainer; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSearchTimingProfile; -import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.logging.serverLog; public class yacySearch extends Thread { @@ -62,7 +62,7 @@ public class yacySearch extends Thread { final private Set wordhashes; final private boolean global; final private plasmaCrawlLURL urlManager; - final private plasmaWordIndexEntryContainer containerCache; + final private 
indexContainer containerCache; final private plasmaURLPattern blacklist; final private plasmaSnippetCache snippetCache; final private yacySeed targetPeer; @@ -73,7 +73,7 @@ public class yacySearch extends Thread { final private String prefer, filter; public yacySearch(Set wordhashes, String prefer, String filter, int maxDistance, boolean global, yacySeed targetPeer, - plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, + plasmaCrawlLURL urlManager, indexContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { super("yacySearch_" + targetPeer.getName()); this.wordhashes = wordhashes; @@ -181,7 +181,7 @@ public class yacySearch extends Thread { return result; } - public static yacySearch[] searchHashes(Set wordhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, + public static yacySearch[] searchHashes(Set wordhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, indexContainer containerCache, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) { // check own peer status diff --git a/source/yacy.java b/source/yacy.java index cf89188a4..e096ff35a 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -71,6 +71,7 @@ import de.anomic.http.httpc; import de.anomic.http.httpd; import de.anomic.http.httpdFileHandler; import de.anomic.http.httpdProxyHandler; +import de.anomic.index.indexContainer; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroDyn; @@ -85,7 +86,6 @@ import de.anomic.plasma.plasmaWordIndexAssortmentCluster; import de.anomic.plasma.plasmaWordIndexClassicDB; import 
de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntryInstance; -import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverCore; import de.anomic.server.serverDate; import de.anomic.server.serverFileUtils; @@ -754,7 +754,7 @@ public final class yacy { byte[][] row = (byte[][]) contentIter.next(); String hash = new String(row[0]); - plasmaWordIndexEntryContainer container = assortmentFile.row2container(hash, row); + indexContainer container = assortmentFile.row2container(hash, row); wordEntryCount += container.size(); // importing entity container to home db @@ -848,7 +848,7 @@ public final class yacy { // testing if import process was aborted if (Thread.interrupted()) break; - plasmaWordIndexEntryContainer newContainer; + indexContainer newContainer; try { wordCounter++; wordHash = (String) importWordHashIterator.next(); @@ -961,7 +961,7 @@ public final class yacy { String wordChunkStartHash = "------------", wordChunkEndHash; while (wordHashIterator.hasNext()) { - plasmaWordIndexEntryContainer wordIdxContainer = null; + indexContainer wordIdxContainer = null; try { wordCounter++; wordhash = (String) wordHashIterator.next();