From 5041d330cecc34aecffa071f600fe719bb1f0b6d Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 28 May 2006 11:44:50 +0000 Subject: [PATCH] refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2150 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 12 +- htroot/yacy/transferRWI.java | 6 +- source/de/anomic/index/indexAbstractRI.java | 3 +- .../indexRAMCacheRI.java} | 91 ++-- .../indexTreeMapContainer.java} | 93 ++-- .../indexURLEntry.java} | 423 +++++++++--------- .../plasma/dbImport/plasmaDbImporter.java | 6 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 11 +- source/de/anomic/plasma/plasmaDHTChunk.java | 28 +- .../de/anomic/plasma/plasmaSearchEvent.java | 14 +- .../anomic/plasma/plasmaSearchPreOrder.java | 24 +- .../plasma/plasmaSearchRankingProfile.java | 5 +- .../de/anomic/plasma/plasmaSearchResult.java | 13 +- .../de/anomic/plasma/plasmaSwitchboard.java | 8 +- source/de/anomic/plasma/plasmaWordIndex.java | 27 +- .../plasma/plasmaWordIndexAssortment.java | 12 +- .../plasmaWordIndexAssortmentCluster.java | 18 +- .../plasma/plasmaWordIndexClassicDB.java | 12 +- .../anomic/plasma/plasmaWordIndexEntity.java | 21 +- source/de/anomic/yacy/yacyClient.java | 22 +- source/yacy.java | 10 +- 21 files changed, 411 insertions(+), 448 deletions(-) rename source/de/anomic/{plasma/plasmaWordIndexCache.java => index/indexRAMCacheRI.java} (80%) rename source/de/anomic/{plasma/plasmaWordIndexEntryContainer.java => index/indexTreeMapContainer.java} (71%) rename source/de/anomic/{plasma/plasmaWordIndexEntryInstance.java => index/indexURLEntry.java} (83%) diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 69b926465..1e9f54c12 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -63,7 +63,7 @@ import de.anomic.index.indexURL; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; -import de.anomic.plasma.plasmaWordIndexEntryInstance; +import de.anomic.index.indexURLEntry; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyClient; @@ -153,7 +153,7 @@ public class IndexControl_p { int i = 0; urlx = new String[index.size()]; while (en.hasNext()) { - urlx[i++] = ((plasmaWordIndexEntryInstance) en.next()).getUrlHash(); + urlx[i++] = ((indexURLEntry) en.next()).getUrlHash(); } index = null; } @@ -254,10 +254,10 @@ public class IndexControl_p { Iterator urlIter = index.entries(); HashMap knownURLs = new HashMap(); HashSet unknownURLEntries = new HashSet(); - plasmaWordIndexEntryInstance indexEntry; + indexURLEntry indexEntry; plasmaCrawlLURL.Entry lurl; while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); + indexEntry = (indexURLEntry) urlIter.next(); try { lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null); if (lurl.toString() == null) { @@ -437,9 +437,9 @@ public class IndexControl_p { int i = 0; final TreeMap tm = new TreeMap(); - plasmaWordIndexEntryInstance xi; + indexURLEntry xi; while (en.hasNext()) { - xi = (plasmaWordIndexEntryInstance) en.next(); + xi = (indexURLEntry) en.next(); uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())}; try { us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString(); diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index eb8e255b5..510749299 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -52,7 +52,7 @@ import java.util.LinkedList; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaWordIndexEntryInstance; +import de.anomic.index.indexURLEntry; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -123,7 +123,7 @@ public final class transferRWI { int p; String wordHash; String urlHash; - plasmaWordIndexEntryInstance entry; + indexURLEntry entry; int wordhashesSize = v.size(); final HashSet unknownURL = new HashSet(); String[] wordhashes = new String[v.size()]; @@ -136,7 +136,7 @@ public final class transferRWI { if (p > 0) { wordHash = estring.substring(0, p); wordhashes[received] = wordHash; - entry = new plasmaWordIndexEntryInstance(estring.substring(p)); + entry = new indexURLEntry(estring.substring(p)); sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true); serverCore.checkInterruption(); diff --git a/source/de/anomic/index/indexAbstractRI.java b/source/de/anomic/index/indexAbstractRI.java index 2ad05b311..fc6cc4e70 100644 --- a/source/de/anomic/index/indexAbstractRI.java +++ b/source/de/anomic/index/indexAbstractRI.java @@ -26,12 +26,11 @@ package de.anomic.index; -import de.anomic.plasma.plasmaWordIndexEntryContainer; public abstract class indexAbstractRI implements indexRI { public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { - plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); + indexTreeMapContainer container = new indexTreeMapContainer(wordHash); container.add(newEntry); return addEntries(container, updateTime, dhtCase); } diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/index/indexRAMCacheRI.java similarity index 80% rename from source/de/anomic/plasma/plasmaWordIndexCache.java rename to source/de/anomic/index/indexRAMCacheRI.java index 2390c2e3e..e48dfcfc7 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/index/indexRAMCacheRI.java @@ -1,14 +1,15 @@ -// plasmaWordIndexCache.java -// ------------------------- -// part of YACY -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 +// indexRAMCacheRI.java +// (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 2005 on http://www.anomic.de // -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ +// This is a part of YaCy, a peer-to-peer based web search engine // +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -22,27 +23,8 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. -package de.anomic.plasma; +package de.anomic.index; import java.io.File; import java.io.IOException; @@ -50,18 +32,15 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; -import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; -import de.anomic.index.indexRI; -import de.anomic.index.indexAbstractRI; import de.anomic.kelondro.kelondroArray; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroRecords; +import de.anomic.plasma.plasmaWordIndexAssortment; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; -public final class plasmaWordIndexCache extends indexAbstractRI implements indexRI { +public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { // environment constants private static final String indexArrayFileName = "indexDump1.array"; @@ -87,7 +66,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index //minKey = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; } - public plasmaWordIndexCache(File databaseRoot, serverLog log) { + public indexRAMCacheRI(File databaseRoot, serverLog log) { // creates a new index cache // the cache has a back-end where indexes that do not fit in the cache are flushed @@ -120,22 +99,22 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index long wordsPerSecond = 0, wordcount = 0, urlcount = 0; Map.Entry entry; String wordHash; - plasmaWordIndexEntryContainer container; + indexTreeMapContainer container; long updateTime; - plasmaWordIndexEntryInstance wordEntry; + indexURLEntry wordEntry; byte[][] row = new byte[5][]; // write kCache, this will be melted with the wCache upon load synchronized (kCache) { Iterator i = kCache.values().iterator(); while (i.hasNext()) { - container = (plasmaWordIndexEntryContainer) i.next(); + container = (indexTreeMapContainer) i.next(); // put entries on stack if (container != null) { Iterator ci = container.entries(); while (ci.hasNext()) { - wordEntry = (plasmaWordIndexEntryInstance) ci.next(); + wordEntry = (indexURLEntry) ci.next(); row[0] = container.wordHash().getBytes(); row[1] = kelondroRecords.long2bytes(container.size(), 4); row[2] = kelondroRecords.long2bytes(container.updated(), 8); @@ -158,13 +137,13 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index entry = (Map.Entry) i.next(); wordHash = (String) entry.getKey(); updateTime = getUpdateTime(wordHash); - container = (plasmaWordIndexEntryContainer) entry.getValue(); + container = (indexTreeMapContainer) entry.getValue(); // put entries on stack if (container != null) { Iterator ci = container.entries(); while (ci.hasNext()) { - wordEntry = (plasmaWordIndexEntryInstance) ci.next(); + wordEntry = (indexURLEntry) ci.next(); row[0] = wordHash.getBytes(); row[1] = kelondroRecords.long2bytes(container.size(), 4); row[2] = kelondroRecords.long2bytes(updateTime, 8); @@ -203,7 +182,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index int i = dumpArray.size(); String wordHash; //long creationTime; - plasmaWordIndexEntryInstance wordEntry; + indexURLEntry wordEntry; byte[][] row; //Runtime rt = Runtime.getRuntime(); while (i-- > 0) { @@ -212,7 +191,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index if ((row[0] == null) || (row[1] == null) || (row[2] == null) || (row[3] == null) || (row[4] == null)) continue; wordHash = new String(row[0], "UTF-8"); //creationTime = kelondroRecords.bytes2long(row[2]); - wordEntry = new plasmaWordIndexEntryInstance(new String(row[3], "UTF-8"), new String(row[4], "UTF-8")); + wordEntry = new indexURLEntry(new String(row[3], "UTF-8"), new String(row[4], "UTF-8")); // store to cache addEntry(wordHash, wordEntry, startTime, false); urlCount++; @@ -288,7 +267,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index public int indexSize(String wordHash) { int size = 0; - plasmaWordIndexEntryContainer cacheIndex = (plasmaWordIndexEntryContainer) wCache.get(wordHash); + indexTreeMapContainer cacheIndex = (indexTreeMapContainer) wCache.get(wordHash); if (cacheIndex != null) size += cacheIndex.size(); return size; } @@ -302,13 +281,13 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index // find entries in kCache that are too old for that place and shift them to the wCache long time; Long l; - plasmaWordIndexEntryContainer container; + indexTreeMapContainer container; synchronized (kCache) { while (kCache.size() > 0) { l = (Long) kCache.firstKey(); time = l.longValue(); if (System.currentTimeMillis() - time < kCacheMaxAge) return; - container = (plasmaWordIndexEntryContainer) kCache.remove(l); + container = (indexTreeMapContainer) kCache.remove(l); addEntries(container, container.updated(), false); } } @@ -362,13 +341,13 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index } public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) { - return (plasmaWordIndexEntryContainer) wCache.get(wordHash); + return (indexTreeMapContainer) wCache.get(wordHash); } public indexContainer deleteContainer(String wordHash) { // returns the index that had been deleted synchronized (wCache) { - plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.remove(wordHash); + indexTreeMapContainer container = (indexTreeMapContainer) wCache.remove(wordHash); hashScore.deleteScore(wordHash); hashDate.deleteScore(wordHash); return container; @@ -379,7 +358,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index if (urlHashes.length == 0) return 0; int count = 0; synchronized (wCache) { - plasmaWordIndexEntryContainer c = (plasmaWordIndexEntryContainer) deleteContainer(wordHash); + indexTreeMapContainer c = (indexTreeMapContainer) deleteContainer(wordHash); if (c != null) { count = c.removeEntries(wordHash, urlHashes, deleteComplete); if (c.size() != 0) this.addEntries(c, System.currentTimeMillis(), false); @@ -397,13 +376,13 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index Iterator i = kCache.entrySet().iterator(); Map.Entry entry; Long l; - plasmaWordIndexEntryContainer c; + indexTreeMapContainer c; while (i.hasNext()) { entry = (Map.Entry) i.next(); l = (Long) entry.getKey(); // get container - c = (plasmaWordIndexEntryContainer) entry.getValue(); + c = (indexTreeMapContainer) entry.getValue(); if (c.remove(urlHash) != null) { if (c.size() == 0) { i.remove(); @@ -431,8 +410,8 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index } else synchronized (wCache) { // put container into wCache String wordHash = container.wordHash(); - plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) wCache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null - if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash); + indexTreeMapContainer entries = (indexTreeMapContainer) wCache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null + if (entries == null) entries = new indexTreeMapContainer(wordHash); added = entries.add(container, -1); if (added > 0) { wCache.put(wordHash, entries); @@ -447,15 +426,15 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { if (dhtCase) synchronized (kCache) { // put container into kCache - plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); + indexTreeMapContainer container = new indexTreeMapContainer(wordHash); container.add(newEntry); kCache.put(new Long(updateTime + kCacheInc), container); kCacheInc++; if (kCacheInc > 10000) kCacheInc = 0; return null; } else synchronized (wCache) { - plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.get(wordHash); - if (container == null) container = new plasmaWordIndexEntryContainer(wordHash); + indexTreeMapContainer container = (indexTreeMapContainer) wCache.get(wordHash); + if (container == null) container = new indexTreeMapContainer(wordHash); indexEntry[] entries = new indexEntry[] { newEntry }; if (container.add(entries, updateTime) > 0) { wCache.put(wordHash, container); diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/index/indexTreeMapContainer.java similarity index 71% rename from source/de/anomic/plasma/plasmaWordIndexEntryContainer.java rename to source/de/anomic/index/indexTreeMapContainer.java index 0fc3f3bf3..4857be021 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java +++ b/source/de/anomic/index/indexTreeMapContainer.java @@ -1,11 +1,15 @@ -// plasmaIndexEntryContainer.java -// ------------------------------ -// part of YaCy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 -// last major change: 07.05.2005 +// indexTreeMapContainer.java +// (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 07.05.2005 on http://www.anomic.de // +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -19,26 +23,6 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - /* an indexContainer is a bag of indexEntries for a single word @@ -50,32 +34,29 @@ the creationTime is necessary to organize caching of containers */ -package de.anomic.plasma; +package de.anomic.index; import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.Set; import java.util.TreeMap; -import de.anomic.index.indexContainer; -import de.anomic.index.indexAbstractContainer; -import de.anomic.index.indexEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroOrder; -public final class plasmaWordIndexEntryContainer extends indexAbstractContainer implements indexContainer { +public final class indexTreeMapContainer extends indexAbstractContainer implements indexContainer { private String wordHash; private final TreeMap container; // urlHash/plasmaWordIndexEntry - Mapping private long updateTime; private kelondroOrder ordering; - public plasmaWordIndexEntryContainer(String wordHash) { + public indexTreeMapContainer(String wordHash) { this(wordHash, new kelondroNaturalOrder(true)); } - public plasmaWordIndexEntryContainer(String wordHash, kelondroOrder ordering) { + public indexTreeMapContainer(String wordHash, kelondroOrder ordering) { this.wordHash = wordHash; this.updateTime = 0; this.ordering = ordering; @@ -131,7 +112,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer int x = 0; while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) { try { - if (addi((plasmaWordIndexEntryInstance) i.next())) x++; + if (addi((indexURLEntry) i.next())) x++; } catch (ConcurrentModificationException e) {} } this.updateTime = java.lang.Math.max(this.updateTime, c.updated()); @@ -140,7 +121,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer private boolean addi(indexEntry entry) { // returns true if the new entry was added, false if it already existed - plasmaWordIndexEntryInstance oldEntry = (plasmaWordIndexEntryInstance) container.put(entry.getUrlHash(), entry); + indexURLEntry oldEntry = (indexURLEntry) container.put(entry.getUrlHash(), entry); if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container container.put(entry.getUrlHash(), oldEntry); // put it back return false; @@ -153,15 +134,15 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer } public indexEntry get(String urlHash) { - return (plasmaWordIndexEntryInstance) container.get(urlHash); + return (indexURLEntry) container.get(urlHash); } public indexEntry[] getEntryArray() { - return (plasmaWordIndexEntryInstance[]) container.values().toArray(); + return (indexURLEntry[]) container.values().toArray(); } public indexEntry remove(String urlHash) { - return (plasmaWordIndexEntryInstance) container.remove(urlHash); + return (indexURLEntry) container.remove(urlHash); } public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { @@ -190,15 +171,15 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer // order entities by their size TreeMap map = new TreeMap(); - plasmaWordIndexEntryContainer singleContainer; + indexTreeMapContainer singleContainer; Iterator i = containers.iterator(); int count = 0; while (i.hasNext()) { // get next entity: - singleContainer = (plasmaWordIndexEntryContainer) i.next(); + singleContainer = (indexTreeMapContainer) i.next(); // check result - if ((singleContainer == null) || (singleContainer.size() == 0)) return new plasmaWordIndexEntryContainer(null); // as this is a cunjunction of searches, we have no result if any word is not known + if ((singleContainer == null) || (singleContainer.size() == 0)) return new indexTreeMapContainer(null); // as this is a cunjunction of searches, we have no result if any word is not known // store result in order of result size map.put(new Long(singleContainer.size() * 1000 + count), singleContainer); @@ -206,7 +187,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer } // check if there is any result - if (map.size() == 0) return new plasmaWordIndexEntryContainer(null); // no result, nothing found + if (map.size() == 0) return new indexTreeMapContainer(null); // no result, nothing found // the map now holds the search results in order of number of hits per word // we now must pairwise build up a conjunction of these sets @@ -218,14 +199,14 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); searchA = searchResult; searchB = (indexContainer) map.remove(k); - searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance); + searchResult = indexTreeMapContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance); // free resources searchA = null; searchB = null; } // in 'searchResult' is now the combined search result - if (searchResult.size() == 0) return new plasmaWordIndexEntryContainer(null); + if (searchResult.size() == 0) return new indexTreeMapContainer(null); return searchResult; } @@ -238,7 +219,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer public static indexContainer joinConstructive(indexContainer i1, indexContainer i2, long time, int maxDistance) { if ((i1 == null) || (i2 == null)) return null; - if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null); + if ((i1.size() == 0) || (i2.size() == 0)) return new indexTreeMapContainer(null); // decide which method to use int high = ((i1.size() > i2.size()) ? i1.size() : i2.size()); @@ -259,7 +240,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer private static indexContainer joinConstructiveByTest(indexContainer small, indexContainer large, long time, int maxDistance) { System.out.println("DEBUG: JOIN METHOD BY TEST"); - plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result + indexTreeMapContainer conj = new indexTreeMapContainer(null); // start with empty search result Iterator se = small.entries(); indexEntry ie0, ie1; long stamp = System.currentTimeMillis(); @@ -277,31 +258,31 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer private static indexContainer joinConstructiveByEnumeration(indexContainer i1, indexContainer i2, long time, int maxDistance) { System.out.println("DEBUG: JOIN METHOD BY ENUMERATION"); - plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result + indexTreeMapContainer conj = new indexTreeMapContainer(null); // start with empty search result if (!(i1.order().signature().equals(i2.order().signature()))) return conj; // ordering must be equal Iterator e1 = i1.entries(); Iterator e2 = i2.entries(); int c; if ((e1.hasNext()) && (e2.hasNext())) { - plasmaWordIndexEntryInstance ie1; - plasmaWordIndexEntryInstance ie2; - ie1 = (plasmaWordIndexEntryInstance) e1.next(); - ie2 = (plasmaWordIndexEntryInstance) e2.next(); + indexURLEntry ie1; + indexURLEntry ie2; + ie1 = (indexURLEntry) e1.next(); + ie2 = (indexURLEntry) e2.next(); long stamp = System.currentTimeMillis(); while ((System.currentTimeMillis() - stamp) < time) { c = i1.order().compare(ie1.getUrlHash(), ie2.getUrlHash()); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); if (c < 0) { - if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break; + if (e1.hasNext()) ie1 = (indexURLEntry) e1.next(); else break; } else if (c > 0) { - if (e2.hasNext()) ie2 = (plasmaWordIndexEntryInstance) e2.next(); else break; + if (e2.hasNext()) ie2 = (indexURLEntry) e2.next(); else break; } else { // we have found the same urls in different searches! ie1.combineDistance(ie2); if (ie1.worddistance() <= maxDistance) conj.add(ie1); - if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break; - if (e2.hasNext()) ie2 = (plasmaWordIndexEntryInstance) e2.next(); else break; + if (e1.hasNext()) ie1 = (indexURLEntry) e1.next(); else break; + if (e2.hasNext()) ie2 = (indexURLEntry) e2.next(); else break; } } } diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryInstance.java b/source/de/anomic/index/indexURLEntry.java similarity index 83% rename from source/de/anomic/plasma/plasmaWordIndexEntryInstance.java rename to source/de/anomic/index/indexURLEntry.java index ddf5bbdc4..04f17d790 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryInstance.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -1,220 +1,203 @@ -// plasmaIndexEntry.java -// ----------------------- -// part of YaCy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -/* - This class defines the structures of an index entry -*/ - -package de.anomic.plasma; - -import java.util.Properties; - -import de.anomic.index.indexEntry; -import de.anomic.index.indexEntryAttribute; -import de.anomic.index.indexAbstractEntry; -import de.anomic.index.indexURL; -import de.anomic.kelondro.kelondroBase64Order; - -public final class plasmaWordIndexEntryInstance extends indexAbstractEntry implements Cloneable, indexEntry { - - // an wordEntry can be filled in either of two ways: - // by the discrete values of the entry - // or by the encoded entry-string - - // the class instantiation can only be done by a plasmaStore method - // therefore they are all public - public plasmaWordIndexEntryInstance(String urlHash, - int urlLength, // byte-length of complete URL - int urlComps, // number of path components - int titleLength, // length of description/length (longer are better?) - int hitcount, //*how often appears this word in the text - int wordcount, //*total number of words - int phrasecount, //*total number of phrases - int posintext, //*position of word in all words - int posinphrase, //*position of word in its phrase - int posofphrase, //*number of the phrase where word appears - int distance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search - int sizeOfPage, // # of bytes of the page - long lastmodified, //*last-modified time of the document where word appears - long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short - int quality, //*the entropy value - String language, //*(guessed) language of document - char doctype, //*type of document - int outlinksSame, // outlinks to same domain - int outlinksOther,// outlinks to other domain - boolean local //*flag shows that this index was generated locally; othervise its from a remote peer - ) { - - // more needed attributes: - // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc - // - boolean: URL attributes - - if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk"; - this.urlHash = urlHash; - this.hitcount = hitcount; - this.wordcount = wordcount; - this.phrasecount = phrasecount; - this.posintext = posintext; - this.posinphrase = posinphrase; - this.posofphrase = posofphrase; - this.worddistance = distance; - this.lastModified = lastmodified; - this.quality = quality; - this.language = language.getBytes(); - this.doctype = doctype; - this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL; - } - - public plasmaWordIndexEntryInstance(String urlHash, String code) { - // the code is not parsed but used later on - this.urlHash = urlHash; - this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8)); - this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6))); - this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3)); - this.language = code.substring(8, 10).getBytes(); - this.doctype = code.charAt(10); - this.localflag = code.charAt(11); - this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0; - this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0; - this.posofphrase = (code.length() >= 17) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0; - this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0; - this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0; - this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0; - if (hitcount == 0) hitcount = 1; - if (wordcount == 0) wordcount = 1000; - if (phrasecount == 0) phrasecount = 100; - } - - public plasmaWordIndexEntryInstance(String external) { - // parse external form - String[] elts = external.substring(1, external.length() - 1).split(","); - Properties pr = new Properties(); - int p; - for (int i = 0; i < elts.length; i++) { - pr.put(elts[i].substring(0, (p = elts[i].indexOf("="))), elts[i].substring(p + 1)); - } - // set values - this.urlHash = pr.getProperty("h", ""); - this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A")); - this.wordcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("w", "__")); - this.phrasecount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("p", "__")); - this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__")); - this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__")); - this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__")); - this.worddistance = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("i", "__")); - this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A"))); - this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__")); - this.language = pr.getProperty("l", "uk").getBytes(); - this.doctype = pr.getProperty("d", "u").charAt(0); - this.localflag = pr.getProperty("f", ""+indexEntryAttribute.LT_LOCAL).charAt(0); - } - - public Object clone() { - return new plasmaWordIndexEntryInstance(this.toPropertyForm()); - } - - public static int encodedStringFormLength() { - // the size of the index entry attributes when encoded to string - return 24; - } - - public String toEncodedStringForm() { - // attention: this integrates NOT the URL hash into the encoding - // if you need a complete dump, use toExternalForm() - StringBuffer buf = new StringBuffer(encodedStringFormLength()); - - buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2)) - .append(new String(this.language)) - .append(this.doctype) - .append(this.localflag) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2)) - .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)); // 3+3+2+2+1+1+2+2+2+2+2+2= 24 bytes - - return buf.toString(); - } - - public static int encodedByteArrayFormLength() { - // the size of the index entry attributes when encoded to string - return encodedStringFormLength(); - } - - public byte[] toEncodedByteArrayForm() { - return toEncodedStringForm().getBytes(); - } - - public String toPropertyForm() { - StringBuffer str = new StringBuffer(61); - - str.append("{") - .append( "h=").append(this.urlHash) - .append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength)) - .append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) - .append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2)) - .append(",l=").append(new String(this.language)) - .append(",d=").append(this.doctype) - .append(",f=").append(this.localflag) - .append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) - .append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2)) - .append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2)) - .append(",i=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2)) - .append(",w=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2)) - .append(",p=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)) - .append("}"); - - return str.toString(); - } - - public static void main(String[] args) { - // outputs the word hash to a given word - if (args.length != 1) System.exit(0); - System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0])); - } - -} +// indexURLEntry.java +// (C) 2004, 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 2004 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +/* + This class defines the structures of an index entry for URLs +*/ + +package de.anomic.index; + +import java.util.Properties; + +import de.anomic.index.indexEntry; +import de.anomic.index.indexEntryAttribute; +import de.anomic.index.indexAbstractEntry; +import de.anomic.index.indexURL; +import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.plasma.plasmaWordIndex; + +public final class indexURLEntry extends indexAbstractEntry implements Cloneable, indexEntry { + + // an wordEntry can be filled in either of two ways: + // by the discrete values of the entry + // or by the encoded entry-string + + // the class instantiation can only be done by a plasmaStore method + // therefore they are all public + public indexURLEntry(String urlHash, + int urlLength, // byte-length of complete URL + int urlComps, // number of path components + int titleLength, // length of description/length (longer are better?) + int hitcount, //*how often appears this word in the text + int wordcount, //*total number of words + int phrasecount, //*total number of phrases + int posintext, //*position of word in all words + int posinphrase, //*position of word in its phrase + int posofphrase, //*number of the phrase where word appears + int distance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search + int sizeOfPage, // # of bytes of the page + long lastmodified, //*last-modified time of the document where word appears + long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short + int quality, //*the entropy value + String language, //*(guessed) language of document + char doctype, //*type of document + int outlinksSame, // outlinks to same domain + int outlinksOther,// outlinks to other domain + boolean local //*flag shows that this index was generated locally; othervise its from a remote peer + ) { + + // more needed attributes: + // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc + // - boolean: URL attributes + + if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk"; + this.urlHash = urlHash; + this.hitcount = hitcount; + this.wordcount = wordcount; + this.phrasecount = phrasecount; + this.posintext = posintext; + this.posinphrase = posinphrase; + this.posofphrase = posofphrase; + this.worddistance = distance; + this.lastModified = lastmodified; + this.quality = quality; + this.language = language.getBytes(); + this.doctype = doctype; + this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL; + } + + public indexURLEntry(String urlHash, String code) { + // the code is not parsed but used later on + this.urlHash = urlHash; + this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8)); + this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6))); + this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3)); + this.language = code.substring(8, 10).getBytes(); + this.doctype = code.charAt(10); + this.localflag = code.charAt(11); + this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0; + this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0; + this.posofphrase = (code.length() >= 17) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0; + this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0; + this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0; + this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0; + if (hitcount == 0) hitcount = 1; + if (wordcount == 0) wordcount = 1000; + if (phrasecount == 0) phrasecount = 100; + } + + public indexURLEntry(String external) { + // parse external form + String[] elts = external.substring(1, external.length() - 1).split(","); + Properties pr = new Properties(); + int p; + for (int i = 0; i < elts.length; i++) { + pr.put(elts[i].substring(0, (p = elts[i].indexOf("="))), elts[i].substring(p + 1)); + } + // set values + this.urlHash = pr.getProperty("h", ""); + this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A")); + this.wordcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("w", "__")); + this.phrasecount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("p", "__")); + this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__")); + this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__")); + this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__")); + this.worddistance = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("i", "__")); + this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A"))); + this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__")); + this.language = pr.getProperty("l", "uk").getBytes(); + this.doctype = pr.getProperty("d", "u").charAt(0); + this.localflag = pr.getProperty("f", ""+indexEntryAttribute.LT_LOCAL).charAt(0); + } + + public Object clone() { + return new indexURLEntry(this.toPropertyForm()); + } + + public static int encodedStringFormLength() { + // the size of the index entry attributes when encoded to string + return 24; + } + + public String toEncodedStringForm() { + // attention: this integrates NOT the URL hash into the encoding + // if you need a complete dump, use toExternalForm() + StringBuffer buf = new StringBuffer(encodedStringFormLength()); + + buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2)) + .append(new String(this.language)) + .append(this.doctype) + .append(this.localflag) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2)) + .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)); // 3+3+2+2+1+1+2+2+2+2+2+2= 24 bytes + + return buf.toString(); + } + + public static int encodedByteArrayFormLength() { + // the size of the index entry attributes when encoded to string + return encodedStringFormLength(); + } + + public byte[] toEncodedByteArrayForm() { + return toEncodedStringForm().getBytes(); + } + + public String toPropertyForm() { + StringBuffer str = new StringBuffer(61); + + str.append("{") + .append( "h=").append(this.urlHash) + .append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength)) + .append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) + .append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2)) + .append(",l=").append(new String(this.language)) + .append(",d=").append(this.doctype) + .append(",f=").append(this.localflag) + .append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) + .append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2)) + .append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2)) + .append(",i=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2)) + .append(",w=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2)) + .append(",p=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)) + .append("}"); + + return str.toString(); + } + + public static void main(String[] args) { + // outputs the word hash to a given word + if (args.length != 1) System.exit(0); + System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0])); + } + +} diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 14c6650ab..2b1272109 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -11,7 +11,7 @@ import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; -import de.anomic.plasma.plasmaWordIndexEntryInstance; +import de.anomic.index.indexURLEntry; import de.anomic.server.serverDate; public class plasmaDbImporter extends AbstractImporter implements dbImporter { @@ -128,13 +128,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { // loop throug the entities of the container and get the // urlhash Iterator importWordIdxEntries = newContainer.entries(); - plasmaWordIndexEntryInstance importWordIdxEntry; + indexURLEntry importWordIdxEntry; while (importWordIdxEntries.hasNext()) { // testing if import process was aborted if (isAborted()) break; // getting next word index entry - importWordIdxEntry = (plasmaWordIndexEntryInstance) importWordIdxEntries.next(); + importWordIdxEntry = (indexURLEntry) importWordIdxEntries.next(); String urlHash = importWordIdxEntry.getUrlHash(); entityUrls.add(urlHash); } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index dba489f81..df9f87061 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -67,6 +67,7 @@ import java.util.Properties; import de.anomic.http.httpc; import de.anomic.http.httpc.response; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroTree; import de.anomic.plasma.plasmaHTCache; @@ -159,7 +160,7 @@ public final class plasmaCrawlLURL extends indexURL { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } - public Entry getEntry(String hash, plasmaWordIndexEntryInstance searchedWord) throws IOException { + public Entry getEntry(String hash, indexURLEntry searchedWord) throws IOException { return new Entry(hash, searchedWord); } @@ -414,7 +415,7 @@ public final class plasmaCrawlLURL extends indexURL { private int size; private int wordCount; private String snippet; - private plasmaWordIndexEntryInstance word; // this is only used if the url is transported via remote search requests + private indexURLEntry word; // this is only used if the url is transported via remote search requests private boolean stored; // more needed attributes: @@ -449,7 +450,7 @@ public final class plasmaCrawlLURL extends indexURL { this.stored = false; } - public Entry(String urlHash, plasmaWordIndexEntryInstance searchedWord) throws IOException { + public Entry(String urlHash, indexURLEntry searchedWord) throws IOException { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. @@ -510,7 +511,7 @@ public final class plasmaCrawlLURL extends indexURL { this.wordCount = Integer.parseInt(prop.getProperty("wc", "0")); this.snippet = prop.getProperty("snippet", ""); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); - this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntryInstance(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; + this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; this.stored = false; //} } catch (Exception e) { @@ -647,7 +648,7 @@ public final class plasmaCrawlLURL extends indexURL { return snippet; } - public plasmaWordIndexEntryInstance word() { + public indexURLEntry word() { return word; } diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index a6a1c6c03..b135d0e21 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -47,6 +47,8 @@ import java.util.HashMap; import java.util.Iterator; import de.anomic.index.indexContainer; +import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.server.serverCodings; @@ -72,22 +74,22 @@ public class plasmaDHTChunk { private int status = chunkStatus_UNDEFINED; private String startPointHash; - private plasmaWordIndexEntryContainer[] indexContainers = null; + private indexTreeMapContainer[] indexContainers = null; private HashMap urlCache; // String (url-hash) / plasmaCrawlLURL.Entry private int idxCount; private long selectionStartTime = 0; private long selectionEndTime = 0; - public plasmaWordIndexEntryContainer firstContainer() { + public indexTreeMapContainer firstContainer() { return indexContainers[0]; } - public plasmaWordIndexEntryContainer lastContainer() { + public indexTreeMapContainer lastContainer() { return indexContainers[indexContainers.length - 1]; } - public plasmaWordIndexEntryContainer[] containers() { + public indexTreeMapContainer[] containers() { return indexContainers; } @@ -189,7 +191,7 @@ public class plasmaDHTChunk { Iterator wordHashIterator = wordIndex.wordHashSet(hash, resourceLevel, true, maxcount).iterator(); indexContainer indexContainer; Iterator urlIter; - plasmaWordIndexEntryInstance indexEntry; + indexURLEntry indexEntry; plasmaCrawlLURL.Entry lurl; int refcount = 0; @@ -197,7 +199,7 @@ public class plasmaDHTChunk { double maximumDistance = ((double) peerRedundancy * 2) / ((double) yacyCore.seedDB.sizeConnected()); while ((maxcount > refcount) && (wordHashIterator.hasNext()) && ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0) - && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntryContainer) tmpContainers.get(0)).wordHash()) < maximumDistance))) { + && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(nexthash, ((indexTreeMapContainer) tmpContainers.get(0)).wordHash()) < maximumDistance))) { // make an on-the-fly entity and insert values indexContainer = wordIndex.getContainer(nexthash, true, 10000); int notBoundCounter = 0; @@ -205,7 +207,7 @@ public class plasmaDHTChunk { urlIter = indexContainer.entries(); // iterate over indexes to fetch url entries and store them in the urlCache while ((urlIter.hasNext()) && (maxcount > refcount)) { - indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); + indexEntry = (indexURLEntry) urlIter.next(); try { lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry); if ((lurl == null) || (lurl.url() == null)) { @@ -225,7 +227,7 @@ public class plasmaDHTChunk { // remove all remaining; we have enough while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); + indexEntry = (indexURLEntry) urlIter.next(); urlIter.remove(); } @@ -238,7 +240,7 @@ public class plasmaDHTChunk { } } // create result - indexContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]); + indexContainers = (indexTreeMapContainer[]) tmpContainers.toArray(new indexTreeMapContainer[tmpContainers.size()]); if ((indexContainers == null) || (indexContainers.length == 0)) { log.logFine("No index available for index transfer, hash start-point " + startPointHash); @@ -251,13 +253,13 @@ public class plasmaDHTChunk { return refcount; } catch (kelondroException e) { log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e); - indexContainers = new plasmaWordIndexEntryContainer[0]; + indexContainers = new indexTreeMapContainer[0]; urlCache = new HashMap(); this.status = chunkStatus_FAILED; return 0; } catch (IOException e) { log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e); - indexContainers = new plasmaWordIndexEntryContainer[0]; + indexContainers = new indexTreeMapContainer[0]; urlCache = new HashMap(); this.status = chunkStatus_FAILED; return 0; @@ -267,7 +269,7 @@ public class plasmaDHTChunk { public int deleteTransferIndexes() { Iterator urlIter; - plasmaWordIndexEntryInstance indexEntry; + indexURLEntry indexEntry; String[] urlHashes; int count = 0; for (int i = 0; i < this.indexContainers.length; i++) { @@ -276,7 +278,7 @@ public class plasmaDHTChunk { urlHashes = new String[this.indexContainers[i].size()]; urlIter = this.indexContainers[i].entries(); while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); + indexEntry = (indexURLEntry) urlIter.next(); urlHashes[c++] = indexEntry.getUrlHash(); } count += wordIndex.removeEntries(this.indexContainers[i].wordHash(), urlHashes, true); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 6d02e6f18..d0c3b02cc 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -52,6 +52,8 @@ import de.anomic.server.logging.serverLog; import de.anomic.server.serverInstantThread; import de.anomic.yacy.yacySearch; import de.anomic.index.indexContainer; +import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexURLEntry; public final class plasmaSearchEvent extends Thread implements Runnable { @@ -84,8 +86,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.ranking = ranking; this.urlStore = urlStore; this.snippetCache = snippetCache; - this.rcLocal = new plasmaWordIndexEntryContainer(null); - this.rcGlobal = new plasmaWordIndexEntryContainer(null); + this.rcLocal = new indexTreeMapContainer(null); + this.rcGlobal = new indexTreeMapContainer(null); this.rcGlobalCount = 0; this.profileLocal = localTiming; this.profileGlobal = remoteTiming; @@ -176,13 +178,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // since this is a conjunction we return an empty entity if any word // is not known if (containers == null) { - rcLocal = new plasmaWordIndexEntryContainer(null); + rcLocal = new indexTreeMapContainer(null); return 0; } // join the result profileLocal.startTimer(); - rcLocal = plasmaWordIndexEntryContainer.joinContainer(containers, + rcLocal = indexTreeMapContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_JOIN), query.maxDistance); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_JOIN); @@ -218,7 +220,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime - plasmaWordIndexEntryContainer searchResult = new plasmaWordIndexEntryContainer(null); + indexTreeMapContainer searchResult = new indexTreeMapContainer(null); long preorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_PRESORT); profileLocal.startTimer(); @@ -240,7 +242,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { //if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty //if (searchResult.size() == 0) return acc; // case that we have nothing to do - plasmaWordIndexEntryInstance entry; + indexURLEntry entry; plasmaCrawlLURL.Entry page; int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); try { diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 9ea5de0c3..ed879f40f 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -49,6 +49,8 @@ import java.util.Iterator; import de.anomic.server.serverCodings; import de.anomic.server.serverFileUtils; +import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBinSearch; public final class plasmaSearchPreOrder { @@ -56,7 +58,7 @@ public final class plasmaSearchPreOrder { public static kelondroBinSearch[] ybrTables = null; // block-rank tables private static boolean useYBR = true; - private plasmaWordIndexEntryInstance entryMin, entryMax; + private indexURLEntry entryMin, entryMax; private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private plasmaSearchQuery query; private plasmaSearchRankingProfile ranking; @@ -116,36 +118,36 @@ public final class plasmaSearchPreOrder { return pageAcc.size() > 0; } - public plasmaWordIndexEntryInstance next() { + public indexURLEntry next() { Object top = pageAcc.lastKey(); - return (plasmaWordIndexEntryInstance) pageAcc.remove(top); + return (indexURLEntry) pageAcc.remove(top); } - public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) { + public void addContainer(indexTreeMapContainer container, long maxTime) { long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; - plasmaWordIndexEntryInstance indexEntry; + indexURLEntry indexEntry; // first pass: find min/max to obtain limits for normalization Iterator i = container.entries(); int count = 0; while (i.hasNext()) { if (System.currentTimeMillis() > limitTime) break; - indexEntry = (plasmaWordIndexEntryInstance) i.next(); - if (entryMin == null) entryMin = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMin.min(indexEntry); - if (entryMax == null) entryMax = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMax.max(indexEntry); + indexEntry = (indexURLEntry) i.next(); + if (entryMin == null) entryMin = (indexURLEntry) indexEntry.clone(); else entryMin.min(indexEntry); + if (entryMax == null) entryMax = (indexURLEntry) indexEntry.clone(); else entryMax.max(indexEntry); count++; } // second pass: normalize entries and get ranking i = container.entries(); for (int j = 0; j < count; j++) { - indexEntry = (plasmaWordIndexEntryInstance) i.next(); + indexEntry = (indexURLEntry) i.next(); pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry); } } - public plasmaWordIndexEntryInstance[] getNormalizer() { - return new plasmaWordIndexEntryInstance[] {entryMin, entryMax}; + public indexURLEntry[] getNormalizer() { + return new indexURLEntry[] {entryMin, entryMax}; } public static int ybr_p(String urlHash) { diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index b6608d7c6..1c0a632ef 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -47,6 +47,7 @@ import java.util.Map; import java.util.Set; import de.anomic.index.indexEntry; +import de.anomic.index.indexURLEntry; public class plasmaSearchRankingProfile { @@ -165,8 +166,8 @@ public class plasmaSearchRankingProfile { public long preRanking(indexEntry entry) { long ranking = 0; - if (entry instanceof plasmaWordIndexEntryInstance) { - plasmaWordIndexEntryInstance normalizedEntry = (plasmaWordIndexEntryInstance) entry; + if (entry instanceof indexURLEntry) { + indexURLEntry normalizedEntry = (indexURLEntry) entry; ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue(); ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue(); ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue(); diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index d111810bc..df956b4dd 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -57,10 +57,11 @@ import de.anomic.server.serverCodings; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; public final class plasmaSearchResult { - private plasmaWordIndexEntryInstance entryMin, entryMax; + private indexURLEntry entryMin, entryMax; private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects @@ -107,11 +108,11 @@ public final class plasmaSearchResult { return (plasmaCrawlLURL.Entry) pageAcc.remove(top); } - protected void addResult(plasmaWordIndexEntryInstance indexEntry, plasmaCrawlLURL.Entry page) { + protected void addResult(indexURLEntry indexEntry, plasmaCrawlLURL.Entry page) { // make min/max for normalization - if (entryMin == null) entryMin = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMin.min(indexEntry); - if (entryMax == null) entryMax = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMax.max(indexEntry); + if (entryMin == null) entryMin = (indexURLEntry) indexEntry.clone(); else entryMin.min(indexEntry); + if (entryMax == null) entryMax = (indexURLEntry) indexEntry.clone(); else entryMax.max(indexEntry); // take out relevant information for reference computation URL url = page.url(); @@ -139,13 +140,13 @@ public final class plasmaSearchResult { for (int i = 0; i < references.length; i++) commonSense.add(references[i]); Object[] resultVector; - plasmaWordIndexEntryInstance indexEntry; + indexURLEntry indexEntry; plasmaCrawlLURL.Entry page; long ranking; for (int i = 0; i < results.size(); i++) { // take out values from result array resultVector = (Object[]) results.get(i); - indexEntry = (plasmaWordIndexEntryInstance) resultVector[0]; + indexEntry = (indexURLEntry) resultVector[0]; page = (plasmaCrawlLURL.Entry) resultVector[1]; // calculate ranking diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index c1c80982a..39ff6bb26 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -131,7 +131,9 @@ import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpc; import de.anomic.index.indexEntryAttribute; +import de.anomic.index.indexTreeMapContainer; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMSetTools; @@ -1473,8 +1475,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String word = (String) wentry.getKey(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); String wordHash = indexEntryAttribute.word2hash(word); - plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash); - plasmaWordIndexEntryInstance wordIdxEntry = new plasmaWordIndexEntryInstance(urlHash, + indexTreeMapContainer wordIdxContainer = new indexTreeMapContainer(wordHash); + indexURLEntry wordIdxEntry = new indexURLEntry(urlHash, urlLength, urlComps, wordStat.count, document.longTitle.length(), @@ -1503,7 +1505,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // transfering the index to the storage peer String error = yacyClient.transferIndex( seed, - (plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]), + (indexTreeMapContainer[])tmpContainers.toArray(new indexTreeMapContainer[tmpContainers.size()]), urlCache, true, 120000); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 252e4be65..2af25484e 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -61,8 +61,11 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; +import de.anomic.index.indexRAMCacheRI; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; +import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMergeIterator; @@ -76,7 +79,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { private static final int assortmentCount = 64; private final File databaseRoot; - private final plasmaWordIndexCache ramCache; + private final indexRAMCacheRI ramCache; private final plasmaWordIndexAssortmentCluster assortmentCluster; private int assortmentBufferSize; //kb private final plasmaWordIndexClassicDB backend; @@ -85,7 +88,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) { this.databaseRoot = databaseRoot; this.backend = new plasmaWordIndexClassicDB(databaseRoot, log); - this.ramCache = new plasmaWordIndexCache(databaseRoot, log); + this.ramCache = new indexRAMCacheRI(databaseRoot, log); // create new assortment cluster path File assortmentClusterPath = new File(databaseRoot, indexAssortmentClusterPath); @@ -149,7 +152,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public void flushControl() { // check for forced flush synchronized (this) { ramCache.shiftK2W(); } - while (ramCache.maxURLinWCache() > plasmaWordIndexCache.wCacheReferenceLimit) { + while (ramCache.maxURLinWCache() > indexRAMCacheRI.wCacheReferenceLimit) { flushCache(1); } if (ramCache.wSize() > ramCache.getMaxWordCount()) { @@ -242,7 +245,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { Iterator i = condenser.words(); Map.Entry wentry; String word; - plasmaWordIndexEntryInstance ientry; + indexURLEntry ientry; plasmaCondenser.wordStatProp wprop; String wordHash; int urlLength = url.toString().length(); @@ -254,7 +257,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); wordHash = indexEntryAttribute.word2hash(word); - ientry = new plasmaWordIndexEntryInstance(urlHash, + ientry = new indexURLEntry(urlHash, urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(), wprop.count, condenser.RESULT_SIMI_WORDS, @@ -281,7 +284,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { long start = System.currentTimeMillis(); - plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); + indexTreeMapContainer container = new indexTreeMapContainer(wordHash); // get from cache // We must not use the container from cache to store everything we find, // as that container remains linked to in the cache and might be changed later @@ -359,7 +362,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public synchronized indexContainer deleteContainer(String wordHash) { indexContainer c = ramCache.deleteContainer(wordHash); - if (c == null) c = new plasmaWordIndexEntryContainer(wordHash); + if (c == null) c = new indexTreeMapContainer(wordHash); c.add(assortmentCluster.deleteContainer(wordHash, -1), -1); c.add(backend.deleteContainer(wordHash), -1); return c; @@ -518,11 +521,11 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { // the combined container will fit, read the container try { Iterator entries = entity.elements(true); - plasmaWordIndexEntryInstance entry; + indexURLEntry entry; while (entries.hasNext()) { - entry = (plasmaWordIndexEntryInstance) entries.next(); + entry = (indexURLEntry) entries.next(); // System.out.println("ENTRY = " + entry.getUrlHash()); - container.add(new plasmaWordIndexEntryInstance[]{entry}, System.currentTimeMillis()); + container.add(new indexURLEntry[]{entry}, System.currentTimeMillis()); } // we have read all elements, now delete the entity entity.deleteComplete(); @@ -570,7 +573,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); String wordHash = ""; indexContainer wordContainer = null; - plasmaWordIndexEntryInstance entry = null; + indexURLEntry entry = null; URL url = null; HashSet urlHashs = new HashSet(); try { @@ -583,7 +586,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { wordHashNow = wordHash; while (containerIterator.hasNext() && run) { waiter(); - entry = (plasmaWordIndexEntryInstance) containerIterator.next(); + entry = (indexURLEntry) containerIterator.next(); // System.out.println("Wordhash: "+wordHash+" UrlHash: // "+entry.getUrlHash()); try { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 628334e5f..13571b1ad 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -58,6 +58,8 @@ import java.util.Iterator; import de.anomic.index.indexContainer; import de.anomic.index.indexEntryAttribute; +import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroTree; @@ -72,7 +74,7 @@ public final class plasmaWordIndexAssortment { 4, // occurrence counter 8, // timestamp of last access indexEntryAttribute.urlHashLength, // corresponding URL hash - plasmaWordIndexEntryInstance.encodedStringFormLength() // URL attributes + indexURLEntry.encodedStringFormLength() // URL attributes }; // class variables @@ -136,9 +138,9 @@ public final class plasmaWordIndexAssortment { row[1] = kelondroRecords.long2bytes(1, 4); row[2] = kelondroRecords.long2bytes(newContainer.updated(), 8); Iterator entries = newContainer.entries(); - plasmaWordIndexEntryInstance entry; + indexURLEntry entry; for (int i = 0; i < assortmentLength; i++) { - entry = (plasmaWordIndexEntryInstance) entries.next(); + entry = (indexURLEntry) entries.next(); row[3 + 2 * i] = entry.getUrlHash().getBytes(); row[4 + 2 * i] = entry.toEncodedStringForm().getBytes(); } @@ -215,10 +217,10 @@ public final class plasmaWordIndexAssortment { public indexContainer row2container(String wordHash, byte[][] row) { if (row == null) return null; final long updateTime = kelondroRecords.bytes2long(row[2]); - plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); + indexTreeMapContainer container = new indexTreeMapContainer(wordHash); for (int i = 0; i < assortmentLength; i++) { container.add( - new plasmaWordIndexEntryInstance[] { new plasmaWordIndexEntryInstance( + new indexURLEntry[] { new indexURLEntry( new String(row[3 + 2 * i]), new String(row[4 + 2 * i])) }, updateTime); } return container; diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index 12cc15777..1fed05ef3 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -54,6 +54,8 @@ import java.util.Iterator; import de.anomic.index.indexContainer; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; +import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroObjectCache; import de.anomic.kelondro.kelondroRecords; @@ -144,13 +146,13 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl int clusterStart = clusterCount - (int) (Math.random() * (clusterCount - clusterMinStart)); // do the insert - plasmaWordIndexEntryContainer c; + indexTreeMapContainer c; Iterator i = newContainer.entries(); for (int j = clusterStart; j >= 1; j--) { - c = new plasmaWordIndexEntryContainer(newContainer.wordHash()); + c = new indexTreeMapContainer(newContainer.wordHash()); for (int k = 0; k < j; k++) { if (i.hasNext()) { - c.add((plasmaWordIndexEntryInstance) i.next(), newContainer.updated()); + c.add((indexURLEntry) i.next(), newContainer.updated()); } else { storeForced(c); return; @@ -186,14 +188,14 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl } if (need == 0) { // we found spaces so that we can put in the newContainer into these spaces - plasmaWordIndexEntryContainer c; + indexTreeMapContainer c; Iterator i = newContainer.entries(); for (int j = testsize - 1; j >= 0; j--) { if (spaces[j] == 0) continue; - c = new plasmaWordIndexEntryContainer(newContainer.wordHash()); + c = new indexTreeMapContainer(newContainer.wordHash()); for (int k = 0; k <= j; k++) { assert (i.hasNext()); - c.add((plasmaWordIndexEntryInstance) i.next(), newContainer.updated()); + c.add((indexURLEntry) i.next(), newContainer.updated()); } storeForced(c); } @@ -216,7 +218,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl public indexContainer deleteContainer(String wordHash, long maxTime) { // removes all records from all the assortments and return them - indexContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); + indexContainer buffer, record = new indexTreeMapContainer(wordHash); long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long remainingTime; for (int i = 0; i < clusterCount; i++) { @@ -240,7 +242,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { // collect all records from all the assortments and return them - indexContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); + indexContainer buffer, record = new indexTreeMapContainer(wordHash); long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long remainingTime; for (int i = 0; i < clusterCount; i++) { diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java index 233a0412d..10f8c918f 100644 --- a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java +++ b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java @@ -52,6 +52,8 @@ import java.util.TreeSet; import de.anomic.index.indexContainer; import de.anomic.index.indexRI; import de.anomic.index.indexAbstractRI; +import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; @@ -193,16 +195,16 @@ public class plasmaWordIndexClassicDB extends indexAbstractRI implements indexRI if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) { plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10); - plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); - plasmaWordIndexEntryInstance entry; + indexTreeMapContainer container = new indexTreeMapContainer(wordHash); + indexURLEntry entry; Iterator i = entity.elements(true); while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) { - entry = (plasmaWordIndexEntryInstance) i.next(); + entry = (indexURLEntry) i.next(); container.add(entry); } return container; } else { - return new plasmaWordIndexEntryContainer(wordHash); + return new indexTreeMapContainer(wordHash); } } @@ -217,7 +219,7 @@ public class plasmaWordIndexClassicDB extends indexAbstractRI implements indexRI public indexContainer deleteContainer(String wordHash) { plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash); - return new plasmaWordIndexEntryContainer(wordHash); + return new indexTreeMapContainer(wordHash); } public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index 779b8a9d1..775910a4b 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -50,6 +50,7 @@ import java.util.Iterator; import de.anomic.index.indexContainer; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroException; import de.anomic.server.logging.serverLog; @@ -93,10 +94,10 @@ public final class plasmaWordIndexEntity { kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent); } catch (IOException e) { theLocation.delete(); - kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, plasmaWordIndexEntryInstance.encodedStringFormLength(), false); + kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, indexURLEntry.encodedStringFormLength(), false); } else { // create new index file - kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, plasmaWordIndexEntryInstance.encodedStringFormLength(), false); + kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, indexURLEntry.encodedStringFormLength(), false); } return kt; // everyone who get this should close it when finished! } @@ -135,23 +136,23 @@ public final class plasmaWordIndexEntity { } catch (IOException e) {} } - public plasmaWordIndexEntryInstance getEntry(String urlhash) throws IOException { + public indexURLEntry getEntry(String urlhash) throws IOException { byte[][] n = theIndex.get(urlhash.getBytes()); if (n == null) return null; - return new plasmaWordIndexEntryInstance(new String(n[0]), new String(n[1])); + return new indexURLEntry(new String(n[0]), new String(n[1])); } public boolean contains(String urlhash) throws IOException { return (theIndex.get(urlhash.getBytes()) != null); } - public boolean contains(plasmaWordIndexEntryInstance entry) throws IOException { + public boolean contains(indexURLEntry entry) throws IOException { return (theIndex.get(entry.getUrlHash().getBytes()) != null); } - public boolean addEntry(plasmaWordIndexEntryInstance entry) throws IOException { + public boolean addEntry(indexURLEntry entry) throws IOException { if (entry == null) return false; - plasmaWordIndexEntryInstance oldEntry = getEntry(entry.getUrlHash()); + indexURLEntry oldEntry = getEntry(entry.getUrlHash()); if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity return false; } @@ -170,7 +171,7 @@ public final class plasmaWordIndexEntity { if (container != null) { Iterator i = container.entries(); while (i.hasNext()) { - if (addEntry((plasmaWordIndexEntryInstance) i.next())) count++; + if (addEntry((indexURLEntry) i.next())) count++; } } @@ -235,7 +236,7 @@ public final class plasmaWordIndexEntity { public Object next() { if (i == null) return null; byte[][] n = (byte[][]) i.next(); - return new plasmaWordIndexEntryInstance(new String(n[0]), new String(n[1])); + return new indexURLEntry(new String(n[0]), new String(n[1])); } public void remove() { throw new UnsupportedOperationException(); @@ -255,7 +256,7 @@ public final class plasmaWordIndexEntity { long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time; try { while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) { - addEntry((plasmaWordIndexEntryInstance) i.next()); + addEntry((indexURLEntry) i.next()); } } catch (kelondroException e) { serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage()); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 19e61dbd4..1626dab5a 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -55,13 +55,13 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; import de.anomic.index.indexContainer; import de.anomic.index.indexEntryAttribute; +import de.anomic.index.indexTreeMapContainer; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaWordIndexEntryInstance; -import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.server.serverCore; @@ -467,9 +467,9 @@ public final class yacyClient { // create containers final int words = wordhashes.length() / indexEntryAttribute.wordHashLength; - plasmaWordIndexEntryContainer[] container = new plasmaWordIndexEntryContainer[words]; + indexTreeMapContainer[] container = new indexTreeMapContainer[words]; for (int i = 0; i < words; i++) { - container[i] = new plasmaWordIndexEntryContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); + container[i] = new indexTreeMapContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); } // insert results to containers @@ -484,10 +484,10 @@ public final class yacyClient { urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); // save the url entry - final plasmaWordIndexEntryInstance entry; + final indexURLEntry entry; if (urlEntry.word() == null) { // the old way to define words - entry = new plasmaWordIndexEntryInstance( + entry = new indexURLEntry( urlEntry.hash(), urlLength, urlComps, urlEntry.descr().length(), @@ -514,7 +514,7 @@ public final class yacyClient { } // add the url entry to the word indexes for (int m = 0; m < words; m++) { - container[m].add(new plasmaWordIndexEntryInstance[]{entry}, System.currentTimeMillis()); + container[m].add(new indexURLEntry[]{entry}, System.currentTimeMillis()); } } @@ -882,11 +882,11 @@ public final class yacyClient { // check if we got all necessary urls in the urlCache (only for debugging) Iterator eenum; - plasmaWordIndexEntryInstance entry; + indexURLEntry entry; for (int i = 0; i < indexes.length; i++) { eenum = indexes[i].entries(); while (eenum.hasNext()) { - entry = (plasmaWordIndexEntryInstance) eenum.next(); + entry = (indexURLEntry) eenum.next(); if (urlCache.get(entry.getUrlHash()) == null) { yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache"); } @@ -962,11 +962,11 @@ public final class yacyClient { int indexcount = 0; final StringBuffer entrypost = new StringBuffer(indexes.length*73); Iterator eenum; - plasmaWordIndexEntryInstance entry; + indexURLEntry entry; for (int i = 0; i < indexes.length; i++) { eenum = indexes[i].entries(); while (eenum.hasNext()) { - entry = (plasmaWordIndexEntryInstance) eenum.next(); + entry = (indexURLEntry) eenum.next(); entrypost.append(indexes[i].wordHash()) .append(entry.toPropertyForm()) .append(serverCore.crlfString); diff --git a/source/yacy.java b/source/yacy.java index e096ff35a..1bfcf4b5f 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -85,7 +85,7 @@ import de.anomic.plasma.plasmaWordIndexAssortment; import de.anomic.plasma.plasmaWordIndexAssortmentCluster; import de.anomic.plasma.plasmaWordIndexClassicDB; import de.anomic.plasma.plasmaWordIndexEntity; -import de.anomic.plasma.plasmaWordIndexEntryInstance; +import de.anomic.index.indexURLEntry; import de.anomic.server.serverCore; import de.anomic.server.serverDate; import de.anomic.server.serverFileUtils; @@ -858,7 +858,7 @@ public final class yacy { // the combined container will fit, read the container Iterator importWordIdxEntries = newContainer.entries(); - plasmaWordIndexEntryInstance importWordIdxEntry; + indexURLEntry importWordIdxEntry; while (importWordIdxEntries.hasNext()) { // testing if import process was aborted @@ -866,7 +866,7 @@ public final class yacy { // getting next word index entry entryCounter++; - importWordIdxEntry = (plasmaWordIndexEntryInstance) importWordIdxEntries.next(); + importWordIdxEntry = (indexURLEntry) importWordIdxEntries.next(); String urlHash = importWordIdxEntry.getUrlHash(); if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try { // importing the new url @@ -969,9 +969,9 @@ public final class yacy { // the combined container will fit, read the container Iterator wordIdxEntries = wordIdxContainer.entries(); - plasmaWordIndexEntryInstance wordIdxEntry; + indexURLEntry wordIdxEntry; while (wordIdxEntries.hasNext()) { - wordIdxEntry = (plasmaWordIndexEntryInstance) wordIdxEntries.next(); + wordIdxEntry = (indexURLEntry) wordIdxEntries.next(); String urlHash = wordIdxEntry.getUrlHash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);