refactoring

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2150 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 7b3b12888c
commit 5041d330ce

@ -63,7 +63,7 @@ import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.index.indexURLEntry;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyClient;
@ -153,7 +153,7 @@ public class IndexControl_p {
int i = 0; int i = 0;
urlx = new String[index.size()]; urlx = new String[index.size()];
while (en.hasNext()) { while (en.hasNext()) {
urlx[i++] = ((plasmaWordIndexEntryInstance) en.next()).getUrlHash(); urlx[i++] = ((indexURLEntry) en.next()).getUrlHash();
} }
index = null; index = null;
} }
@ -254,10 +254,10 @@ public class IndexControl_p {
Iterator urlIter = index.entries(); Iterator urlIter = index.entries();
HashMap knownURLs = new HashMap(); HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet(); HashSet unknownURLEntries = new HashSet();
plasmaWordIndexEntryInstance indexEntry; indexURLEntry indexEntry;
plasmaCrawlLURL.Entry lurl; plasmaCrawlLURL.Entry lurl;
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); indexEntry = (indexURLEntry) urlIter.next();
try { try {
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null); lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null);
if (lurl.toString() == null) { if (lurl.toString() == null) {
@ -437,9 +437,9 @@ public class IndexControl_p {
int i = 0; int i = 0;
final TreeMap tm = new TreeMap(); final TreeMap tm = new TreeMap();
plasmaWordIndexEntryInstance xi; indexURLEntry xi;
while (en.hasNext()) { while (en.hasNext()) {
xi = (plasmaWordIndexEntryInstance) en.next(); xi = (indexURLEntry) en.next();
uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())}; uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())};
try { try {
us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString(); us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString();

@ -52,7 +52,7 @@ import java.util.LinkedList;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.index.indexURLEntry;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -123,7 +123,7 @@ public final class transferRWI {
int p; int p;
String wordHash; String wordHash;
String urlHash; String urlHash;
plasmaWordIndexEntryInstance entry; indexURLEntry entry;
int wordhashesSize = v.size(); int wordhashesSize = v.size();
final HashSet unknownURL = new HashSet(); final HashSet unknownURL = new HashSet();
String[] wordhashes = new String[v.size()]; String[] wordhashes = new String[v.size()];
@ -136,7 +136,7 @@ public final class transferRWI {
if (p > 0) { if (p > 0) {
wordHash = estring.substring(0, p); wordHash = estring.substring(0, p);
wordhashes[received] = wordHash; wordhashes[received] = wordHash;
entry = new plasmaWordIndexEntryInstance(estring.substring(p)); entry = new indexURLEntry(estring.substring(p));
sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true); sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true);
serverCore.checkInterruption(); serverCore.checkInterruption();

@ -26,12 +26,11 @@
package de.anomic.index; package de.anomic.index;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
public abstract class indexAbstractRI implements indexRI { public abstract class indexAbstractRI implements indexRI {
public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) {
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); indexTreeMapContainer container = new indexTreeMapContainer(wordHash);
container.add(newEntry); container.add(newEntry);
return addEntries(container, updateTime, dhtCase); return addEntries(container, updateTime, dhtCase);
} }

@ -1,14 +1,15 @@
// plasmaWordIndexCache.java // indexRAMCacheRI.java
// ------------------------- // (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// part of YACY // first published 2005 on http://www.anomic.de
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// //
// $LastChangedDate$ // This is a part of YaCy, a peer-to-peer based web search engine
// $LastChangedRevision$
// $LastChangedBy$
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or // the Free Software Foundation; either version 2 of the License, or
@ -22,27 +23,8 @@
// You should have received a copy of the GNU General Public License // You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma; package de.anomic.index;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -50,18 +32,15 @@ import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexRI;
import de.anomic.index.indexAbstractRI;
import de.anomic.kelondro.kelondroArray; import de.anomic.kelondro.kelondroArray;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRecords;
import de.anomic.plasma.plasmaWordIndexAssortment;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndexCache extends indexAbstractRI implements indexRI { public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
// environment constants // environment constants
private static final String indexArrayFileName = "indexDump1.array"; private static final String indexArrayFileName = "indexDump1.array";
@ -87,7 +66,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
//minKey = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; //minKey = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-';
} }
public plasmaWordIndexCache(File databaseRoot, serverLog log) { public indexRAMCacheRI(File databaseRoot, serverLog log) {
// creates a new index cache // creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed // the cache has a back-end where indexes that do not fit in the cache are flushed
@ -120,22 +99,22 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
long wordsPerSecond = 0, wordcount = 0, urlcount = 0; long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
Map.Entry entry; Map.Entry entry;
String wordHash; String wordHash;
plasmaWordIndexEntryContainer container; indexTreeMapContainer container;
long updateTime; long updateTime;
plasmaWordIndexEntryInstance wordEntry; indexURLEntry wordEntry;
byte[][] row = new byte[5][]; byte[][] row = new byte[5][];
// write kCache, this will be melted with the wCache upon load // write kCache, this will be melted with the wCache upon load
synchronized (kCache) { synchronized (kCache) {
Iterator i = kCache.values().iterator(); Iterator i = kCache.values().iterator();
while (i.hasNext()) { while (i.hasNext()) {
container = (plasmaWordIndexEntryContainer) i.next(); container = (indexTreeMapContainer) i.next();
// put entries on stack // put entries on stack
if (container != null) { if (container != null) {
Iterator ci = container.entries(); Iterator ci = container.entries();
while (ci.hasNext()) { while (ci.hasNext()) {
wordEntry = (plasmaWordIndexEntryInstance) ci.next(); wordEntry = (indexURLEntry) ci.next();
row[0] = container.wordHash().getBytes(); row[0] = container.wordHash().getBytes();
row[1] = kelondroRecords.long2bytes(container.size(), 4); row[1] = kelondroRecords.long2bytes(container.size(), 4);
row[2] = kelondroRecords.long2bytes(container.updated(), 8); row[2] = kelondroRecords.long2bytes(container.updated(), 8);
@ -158,13 +137,13 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
wordHash = (String) entry.getKey(); wordHash = (String) entry.getKey();
updateTime = getUpdateTime(wordHash); updateTime = getUpdateTime(wordHash);
container = (plasmaWordIndexEntryContainer) entry.getValue(); container = (indexTreeMapContainer) entry.getValue();
// put entries on stack // put entries on stack
if (container != null) { if (container != null) {
Iterator ci = container.entries(); Iterator ci = container.entries();
while (ci.hasNext()) { while (ci.hasNext()) {
wordEntry = (plasmaWordIndexEntryInstance) ci.next(); wordEntry = (indexURLEntry) ci.next();
row[0] = wordHash.getBytes(); row[0] = wordHash.getBytes();
row[1] = kelondroRecords.long2bytes(container.size(), 4); row[1] = kelondroRecords.long2bytes(container.size(), 4);
row[2] = kelondroRecords.long2bytes(updateTime, 8); row[2] = kelondroRecords.long2bytes(updateTime, 8);
@ -203,7 +182,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
int i = dumpArray.size(); int i = dumpArray.size();
String wordHash; String wordHash;
//long creationTime; //long creationTime;
plasmaWordIndexEntryInstance wordEntry; indexURLEntry wordEntry;
byte[][] row; byte[][] row;
//Runtime rt = Runtime.getRuntime(); //Runtime rt = Runtime.getRuntime();
while (i-- > 0) { while (i-- > 0) {
@ -212,7 +191,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
if ((row[0] == null) || (row[1] == null) || (row[2] == null) || (row[3] == null) || (row[4] == null)) continue; if ((row[0] == null) || (row[1] == null) || (row[2] == null) || (row[3] == null) || (row[4] == null)) continue;
wordHash = new String(row[0], "UTF-8"); wordHash = new String(row[0], "UTF-8");
//creationTime = kelondroRecords.bytes2long(row[2]); //creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new plasmaWordIndexEntryInstance(new String(row[3], "UTF-8"), new String(row[4], "UTF-8")); wordEntry = new indexURLEntry(new String(row[3], "UTF-8"), new String(row[4], "UTF-8"));
// store to cache // store to cache
addEntry(wordHash, wordEntry, startTime, false); addEntry(wordHash, wordEntry, startTime, false);
urlCount++; urlCount++;
@ -288,7 +267,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
public int indexSize(String wordHash) { public int indexSize(String wordHash) {
int size = 0; int size = 0;
plasmaWordIndexEntryContainer cacheIndex = (plasmaWordIndexEntryContainer) wCache.get(wordHash); indexTreeMapContainer cacheIndex = (indexTreeMapContainer) wCache.get(wordHash);
if (cacheIndex != null) size += cacheIndex.size(); if (cacheIndex != null) size += cacheIndex.size();
return size; return size;
} }
@ -302,13 +281,13 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
// find entries in kCache that are too old for that place and shift them to the wCache // find entries in kCache that are too old for that place and shift them to the wCache
long time; long time;
Long l; Long l;
plasmaWordIndexEntryContainer container; indexTreeMapContainer container;
synchronized (kCache) { synchronized (kCache) {
while (kCache.size() > 0) { while (kCache.size() > 0) {
l = (Long) kCache.firstKey(); l = (Long) kCache.firstKey();
time = l.longValue(); time = l.longValue();
if (System.currentTimeMillis() - time < kCacheMaxAge) return; if (System.currentTimeMillis() - time < kCacheMaxAge) return;
container = (plasmaWordIndexEntryContainer) kCache.remove(l); container = (indexTreeMapContainer) kCache.remove(l);
addEntries(container, container.updated(), false); addEntries(container, container.updated(), false);
} }
} }
@ -362,13 +341,13 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
} }
public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) { public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) {
return (plasmaWordIndexEntryContainer) wCache.get(wordHash); return (indexTreeMapContainer) wCache.get(wordHash);
} }
public indexContainer deleteContainer(String wordHash) { public indexContainer deleteContainer(String wordHash) {
// returns the index that had been deleted // returns the index that had been deleted
synchronized (wCache) { synchronized (wCache) {
plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.remove(wordHash); indexTreeMapContainer container = (indexTreeMapContainer) wCache.remove(wordHash);
hashScore.deleteScore(wordHash); hashScore.deleteScore(wordHash);
hashDate.deleteScore(wordHash); hashDate.deleteScore(wordHash);
return container; return container;
@ -379,7 +358,7 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
if (urlHashes.length == 0) return 0; if (urlHashes.length == 0) return 0;
int count = 0; int count = 0;
synchronized (wCache) { synchronized (wCache) {
plasmaWordIndexEntryContainer c = (plasmaWordIndexEntryContainer) deleteContainer(wordHash); indexTreeMapContainer c = (indexTreeMapContainer) deleteContainer(wordHash);
if (c != null) { if (c != null) {
count = c.removeEntries(wordHash, urlHashes, deleteComplete); count = c.removeEntries(wordHash, urlHashes, deleteComplete);
if (c.size() != 0) this.addEntries(c, System.currentTimeMillis(), false); if (c.size() != 0) this.addEntries(c, System.currentTimeMillis(), false);
@ -397,13 +376,13 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
Iterator i = kCache.entrySet().iterator(); Iterator i = kCache.entrySet().iterator();
Map.Entry entry; Map.Entry entry;
Long l; Long l;
plasmaWordIndexEntryContainer c; indexTreeMapContainer c;
while (i.hasNext()) { while (i.hasNext()) {
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
l = (Long) entry.getKey(); l = (Long) entry.getKey();
// get container // get container
c = (plasmaWordIndexEntryContainer) entry.getValue(); c = (indexTreeMapContainer) entry.getValue();
if (c.remove(urlHash) != null) { if (c.remove(urlHash) != null) {
if (c.size() == 0) { if (c.size() == 0) {
i.remove(); i.remove();
@ -431,8 +410,8 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
} else synchronized (wCache) { } else synchronized (wCache) {
// put container into wCache // put container into wCache
String wordHash = container.wordHash(); String wordHash = container.wordHash();
plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) wCache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null indexTreeMapContainer entries = (indexTreeMapContainer) wCache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash); if (entries == null) entries = new indexTreeMapContainer(wordHash);
added = entries.add(container, -1); added = entries.add(container, -1);
if (added > 0) { if (added > 0) {
wCache.put(wordHash, entries); wCache.put(wordHash, entries);
@ -447,15 +426,15 @@ public final class plasmaWordIndexCache extends indexAbstractRI implements index
public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) {
if (dhtCase) synchronized (kCache) { if (dhtCase) synchronized (kCache) {
// put container into kCache // put container into kCache
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); indexTreeMapContainer container = new indexTreeMapContainer(wordHash);
container.add(newEntry); container.add(newEntry);
kCache.put(new Long(updateTime + kCacheInc), container); kCache.put(new Long(updateTime + kCacheInc), container);
kCacheInc++; kCacheInc++;
if (kCacheInc > 10000) kCacheInc = 0; if (kCacheInc > 10000) kCacheInc = 0;
return null; return null;
} else synchronized (wCache) { } else synchronized (wCache) {
plasmaWordIndexEntryContainer container = (plasmaWordIndexEntryContainer) wCache.get(wordHash); indexTreeMapContainer container = (indexTreeMapContainer) wCache.get(wordHash);
if (container == null) container = new plasmaWordIndexEntryContainer(wordHash); if (container == null) container = new indexTreeMapContainer(wordHash);
indexEntry[] entries = new indexEntry[] { newEntry }; indexEntry[] entries = new indexEntry[] { newEntry };
if (container.add(entries, updateTime) > 0) { if (container.add(entries, updateTime) > 0) {
wCache.put(wordHash, container); wCache.put(wordHash, container);

@ -1,11 +1,15 @@
// plasmaIndexEntryContainer.java // indexTreeMapContainer.java
// ------------------------------ // (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// part of YaCy // first published 07.05.2005 on http://www.anomic.de
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 07.05.2005
// //
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or // the Free Software Foundation; either version 2 of the License, or
@ -19,26 +23,6 @@
// You should have received a copy of the GNU General Public License // You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
/* /*
an indexContainer is a bag of indexEntries for a single word an indexContainer is a bag of indexEntries for a single word
@ -50,32 +34,29 @@
the creationTime is necessary to organize caching of containers the creationTime is necessary to organize caching of containers
*/ */
package de.anomic.plasma; package de.anomic.index;
import java.util.ConcurrentModificationException; import java.util.ConcurrentModificationException;
import java.util.Iterator; import java.util.Iterator;
import java.util.Set; import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
import de.anomic.index.indexContainer;
import de.anomic.index.indexAbstractContainer;
import de.anomic.index.indexEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroOrder; import de.anomic.kelondro.kelondroOrder;
public final class plasmaWordIndexEntryContainer extends indexAbstractContainer implements indexContainer { public final class indexTreeMapContainer extends indexAbstractContainer implements indexContainer {
private String wordHash; private String wordHash;
private final TreeMap container; // urlHash/plasmaWordIndexEntry - Mapping private final TreeMap container; // urlHash/plasmaWordIndexEntry - Mapping
private long updateTime; private long updateTime;
private kelondroOrder ordering; private kelondroOrder ordering;
public plasmaWordIndexEntryContainer(String wordHash) { public indexTreeMapContainer(String wordHash) {
this(wordHash, new kelondroNaturalOrder(true)); this(wordHash, new kelondroNaturalOrder(true));
} }
public plasmaWordIndexEntryContainer(String wordHash, kelondroOrder ordering) { public indexTreeMapContainer(String wordHash, kelondroOrder ordering) {
this.wordHash = wordHash; this.wordHash = wordHash;
this.updateTime = 0; this.updateTime = 0;
this.ordering = ordering; this.ordering = ordering;
@ -131,7 +112,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
int x = 0; int x = 0;
while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) { while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) {
try { try {
if (addi((plasmaWordIndexEntryInstance) i.next())) x++; if (addi((indexURLEntry) i.next())) x++;
} catch (ConcurrentModificationException e) {} } catch (ConcurrentModificationException e) {}
} }
this.updateTime = java.lang.Math.max(this.updateTime, c.updated()); this.updateTime = java.lang.Math.max(this.updateTime, c.updated());
@ -140,7 +121,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
private boolean addi(indexEntry entry) { private boolean addi(indexEntry entry) {
// returns true if the new entry was added, false if it already existed // returns true if the new entry was added, false if it already existed
plasmaWordIndexEntryInstance oldEntry = (plasmaWordIndexEntryInstance) container.put(entry.getUrlHash(), entry); indexURLEntry oldEntry = (indexURLEntry) container.put(entry.getUrlHash(), entry);
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container
container.put(entry.getUrlHash(), oldEntry); // put it back container.put(entry.getUrlHash(), oldEntry); // put it back
return false; return false;
@ -153,15 +134,15 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
} }
public indexEntry get(String urlHash) { public indexEntry get(String urlHash) {
return (plasmaWordIndexEntryInstance) container.get(urlHash); return (indexURLEntry) container.get(urlHash);
} }
public indexEntry[] getEntryArray() { public indexEntry[] getEntryArray() {
return (plasmaWordIndexEntryInstance[]) container.values().toArray(); return (indexURLEntry[]) container.values().toArray();
} }
public indexEntry remove(String urlHash) { public indexEntry remove(String urlHash) {
return (plasmaWordIndexEntryInstance) container.remove(urlHash); return (indexURLEntry) container.remove(urlHash);
} }
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
@ -190,15 +171,15 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
// order entities by their size // order entities by their size
TreeMap map = new TreeMap(); TreeMap map = new TreeMap();
plasmaWordIndexEntryContainer singleContainer; indexTreeMapContainer singleContainer;
Iterator i = containers.iterator(); Iterator i = containers.iterator();
int count = 0; int count = 0;
while (i.hasNext()) { while (i.hasNext()) {
// get next entity: // get next entity:
singleContainer = (plasmaWordIndexEntryContainer) i.next(); singleContainer = (indexTreeMapContainer) i.next();
// check result // check result
if ((singleContainer == null) || (singleContainer.size() == 0)) return new plasmaWordIndexEntryContainer(null); // as this is a conjunction of searches, we have no result if any word is not known if ((singleContainer == null) || (singleContainer.size() == 0)) return new indexTreeMapContainer(null); // as this is a conjunction of searches, we have no result if any word is not known
// store result in order of result size // store result in order of result size
map.put(new Long(singleContainer.size() * 1000 + count), singleContainer); map.put(new Long(singleContainer.size() * 1000 + count), singleContainer);
@ -206,7 +187,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
} }
// check if there is any result // check if there is any result
if (map.size() == 0) return new plasmaWordIndexEntryContainer(null); // no result, nothing found if (map.size() == 0) return new indexTreeMapContainer(null); // no result, nothing found
// the map now holds the search results in order of number of hits per word // the map now holds the search results in order of number of hits per word
// we now must pairwise build up a conjunction of these sets // we now must pairwise build up a conjunction of these sets
@ -218,14 +199,14 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
searchA = searchResult; searchA = searchResult;
searchB = (indexContainer) map.remove(k); searchB = (indexContainer) map.remove(k);
searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance); searchResult = indexTreeMapContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance);
// free resources // free resources
searchA = null; searchA = null;
searchB = null; searchB = null;
} }
// in 'searchResult' is now the combined search result // in 'searchResult' is now the combined search result
if (searchResult.size() == 0) return new plasmaWordIndexEntryContainer(null); if (searchResult.size() == 0) return new indexTreeMapContainer(null);
return searchResult; return searchResult;
} }
@ -238,7 +219,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
public static indexContainer joinConstructive(indexContainer i1, indexContainer i2, long time, int maxDistance) { public static indexContainer joinConstructive(indexContainer i1, indexContainer i2, long time, int maxDistance) {
if ((i1 == null) || (i2 == null)) return null; if ((i1 == null) || (i2 == null)) return null;
if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null); if ((i1.size() == 0) || (i2.size() == 0)) return new indexTreeMapContainer(null);
// decide which method to use // decide which method to use
int high = ((i1.size() > i2.size()) ? i1.size() : i2.size()); int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
@ -259,7 +240,7 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
private static indexContainer joinConstructiveByTest(indexContainer small, indexContainer large, long time, int maxDistance) { private static indexContainer joinConstructiveByTest(indexContainer small, indexContainer large, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY TEST"); System.out.println("DEBUG: JOIN METHOD BY TEST");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result indexTreeMapContainer conj = new indexTreeMapContainer(null); // start with empty search result
Iterator se = small.entries(); Iterator se = small.entries();
indexEntry ie0, ie1; indexEntry ie0, ie1;
long stamp = System.currentTimeMillis(); long stamp = System.currentTimeMillis();
@ -277,31 +258,31 @@ public final class plasmaWordIndexEntryContainer extends indexAbstractContainer
private static indexContainer joinConstructiveByEnumeration(indexContainer i1, indexContainer i2, long time, int maxDistance) { private static indexContainer joinConstructiveByEnumeration(indexContainer i1, indexContainer i2, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY ENUMERATION"); System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result indexTreeMapContainer conj = new indexTreeMapContainer(null); // start with empty search result
if (!(i1.order().signature().equals(i2.order().signature()))) return conj; // ordering must be equal if (!(i1.order().signature().equals(i2.order().signature()))) return conj; // ordering must be equal
Iterator e1 = i1.entries(); Iterator e1 = i1.entries();
Iterator e2 = i2.entries(); Iterator e2 = i2.entries();
int c; int c;
if ((e1.hasNext()) && (e2.hasNext())) { if ((e1.hasNext()) && (e2.hasNext())) {
plasmaWordIndexEntryInstance ie1; indexURLEntry ie1;
plasmaWordIndexEntryInstance ie2; indexURLEntry ie2;
ie1 = (plasmaWordIndexEntryInstance) e1.next(); ie1 = (indexURLEntry) e1.next();
ie2 = (plasmaWordIndexEntryInstance) e2.next(); ie2 = (indexURLEntry) e2.next();
long stamp = System.currentTimeMillis(); long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) { while ((System.currentTimeMillis() - stamp) < time) {
c = i1.order().compare(ie1.getUrlHash(), ie2.getUrlHash()); c = i1.order().compare(ie1.getUrlHash(), ie2.getUrlHash());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) { if (c < 0) {
if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break; if (e1.hasNext()) ie1 = (indexURLEntry) e1.next(); else break;
} else if (c > 0) { } else if (c > 0) {
if (e2.hasNext()) ie2 = (plasmaWordIndexEntryInstance) e2.next(); else break; if (e2.hasNext()) ie2 = (indexURLEntry) e2.next(); else break;
} else { } else {
// we have found the same urls in different searches! // we have found the same urls in different searches!
ie1.combineDistance(ie2); ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1); if (ie1.worddistance() <= maxDistance) conj.add(ie1);
if (e1.hasNext()) ie1 = (plasmaWordIndexEntryInstance) e1.next(); else break; if (e1.hasNext()) ie1 = (indexURLEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (plasmaWordIndexEntryInstance) e2.next(); else break; if (e2.hasNext()) ie2 = (indexURLEntry) e2.next(); else break;
} }
} }
} }

@ -1,220 +1,203 @@
// plasmaIndexEntry.java // indexURLEntry.java
// ----------------------- // (C) 2004, 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// part of YaCy // first published 2004 on http://www.anomic.de
// (C) by Michael Peter Christen; mc@anomic.de //
// first published on http://www.anomic.de // This is a part of YaCy, a peer-to-peer based web search engine
// Frankfurt, Germany, 2004 //
// // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedDate$ // $LastChangedRevision: 1986 $
// $LastChangedRevision$ // $LastChangedBy: orbiter $
// $LastChangedBy$ //
// // LICENSE
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by // This program is free software; you can redistribute it and/or modify
// the Free Software Foundation; either version 2 of the License, or // it under the terms of the GNU General Public License as published by
// (at your option) any later version. // the Free Software Foundation; either version 2 of the License, or
// // (at your option) any later version.
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of // This program is distributed in the hope that it will be useful,
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// // GNU General Public License for more details.
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software // You should have received a copy of the GNU General Public License
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // along with this program; if not, write to the Free Software
// // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible /*
// for cost, loss of data or any harm that may be caused directly or indirectly This class defines the structures of an index entry for URLs
// by usage of this softare or this documentation. The usage of this software */
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and package de.anomic.index;
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is import java.util.Properties;
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with import de.anomic.index.indexEntry;
// the software. import de.anomic.index.indexEntryAttribute;
// import de.anomic.index.indexAbstractEntry;
// Any changes to this file according to the GPL as documented in the file import de.anomic.index.indexURL;
// gpl.txt aside this file in the shipment you received can be done to the import de.anomic.kelondro.kelondroBase64Order;
// lines that follows this copyright notice here, but changes must not be import de.anomic.plasma.plasmaWordIndex;
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice. public final class indexURLEntry extends indexAbstractEntry implements Cloneable, indexEntry {
// Contributions and changes to the program code must be marked as such.
// an wordEntry can be filled in either of two ways:
/* // by the discrete values of the entry
This class defines the structures of an index entry // or by the encoded entry-string
*/
// the class instantiation can only be done by a plasmaStore method
package de.anomic.plasma; // therefore they are all public
public indexURLEntry(String urlHash,
import java.util.Properties; int urlLength, // byte-length of complete URL
int urlComps, // number of path components
import de.anomic.index.indexEntry; int titleLength, // length of description/length (longer are better?)
import de.anomic.index.indexEntryAttribute; int hitcount, //*how often appears this word in the text
import de.anomic.index.indexAbstractEntry; int wordcount, //*total number of words
import de.anomic.index.indexURL; int phrasecount, //*total number of phrases
import de.anomic.kelondro.kelondroBase64Order; int posintext, //*position of word in all words
int posinphrase, //*position of word in its phrase
public final class plasmaWordIndexEntryInstance extends indexAbstractEntry implements Cloneable, indexEntry { int posofphrase, //*number of the phrase where word appears
int distance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
// an wordEntry can be filled in either of two ways: int sizeOfPage, // # of bytes of the page
// by the discrete values of the entry long lastmodified, //*last-modified time of the document where word appears
// or by the encoded entry-string long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, //*the entropy value
// the class instantiation can only be done by a plasmaStore method String language, //*(guessed) language of document
// therefore they are all public char doctype, //*type of document
public plasmaWordIndexEntryInstance(String urlHash, int outlinksSame, // outlinks to same domain
int urlLength, // byte-length of complete URL int outlinksOther,// outlinks to other domain
int urlComps, // number of path components boolean local //*flag shows that this index was generated locally; othervise its from a remote peer
int titleLength, // length of description/length (longer are better?) ) {
int hitcount, //*how often appears this word in the text
int wordcount, //*total number of words // more needed attributes:
int phrasecount, //*total number of phrases // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
int posintext, //*position of word in all words // - boolean: URL attributes
int posinphrase, //*position of word in its phrase
int posofphrase, //*number of the phrase where word appears if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk";
int distance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search this.urlHash = urlHash;
int sizeOfPage, // # of bytes of the page this.hitcount = hitcount;
long lastmodified, //*last-modified time of the document where word appears this.wordcount = wordcount;
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short this.phrasecount = phrasecount;
int quality, //*the entropy value this.posintext = posintext;
String language, //*(guessed) language of document this.posinphrase = posinphrase;
char doctype, //*type of document this.posofphrase = posofphrase;
int outlinksSame, // outlinks to same domain this.worddistance = distance;
int outlinksOther,// outlinks to other domain this.lastModified = lastmodified;
boolean local //*flag shows that this index was generated locally; othervise its from a remote peer this.quality = quality;
) { this.language = language.getBytes();
this.doctype = doctype;
// more needed attributes: this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL;
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc }
// - boolean: URL attributes
public indexURLEntry(String urlHash, String code) {
if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk"; // the code is not parsed but used later on
this.urlHash = urlHash; this.urlHash = urlHash;
this.hitcount = hitcount; this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8));
this.wordcount = wordcount; this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6)));
this.phrasecount = phrasecount; this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3));
this.posintext = posintext; this.language = code.substring(8, 10).getBytes();
this.posinphrase = posinphrase; this.doctype = code.charAt(10);
this.posofphrase = posofphrase; this.localflag = code.charAt(11);
this.worddistance = distance; this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0;
this.lastModified = lastmodified; this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0;
this.quality = quality; this.posofphrase = (code.length() >= 17) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0;
this.language = language.getBytes(); this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0;
this.doctype = doctype; this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0;
this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL; this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0;
} if (hitcount == 0) hitcount = 1;
if (wordcount == 0) wordcount = 1000;
public plasmaWordIndexEntryInstance(String urlHash, String code) { if (phrasecount == 0) phrasecount = 100;
// the code is not parsed but used later on }
this.urlHash = urlHash;
this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8)); public indexURLEntry(String external) {
this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6))); // parse external form
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3)); String[] elts = external.substring(1, external.length() - 1).split(",");
this.language = code.substring(8, 10).getBytes(); Properties pr = new Properties();
this.doctype = code.charAt(10); int p;
this.localflag = code.charAt(11); for (int i = 0; i < elts.length; i++) {
this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0; pr.put(elts[i].substring(0, (p = elts[i].indexOf("="))), elts[i].substring(p + 1));
this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0; }
this.posofphrase = (code.length() >= 17) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0; // set values
this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0; this.urlHash = pr.getProperty("h", "");
this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0; this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A"));
this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0; this.wordcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("w", "__"));
if (hitcount == 0) hitcount = 1; this.phrasecount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("p", "__"));
if (wordcount == 0) wordcount = 1000; this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__"));
if (phrasecount == 0) phrasecount = 100; this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__"));
} this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__"));
this.worddistance = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("i", "__"));
public plasmaWordIndexEntryInstance(String external) { this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A")));
// parse external form this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__"));
String[] elts = external.substring(1, external.length() - 1).split(","); this.language = pr.getProperty("l", "uk").getBytes();
Properties pr = new Properties(); this.doctype = pr.getProperty("d", "u").charAt(0);
int p; this.localflag = pr.getProperty("f", ""+indexEntryAttribute.LT_LOCAL).charAt(0);
for (int i = 0; i < elts.length; i++) { }
pr.put(elts[i].substring(0, (p = elts[i].indexOf("="))), elts[i].substring(p + 1));
} public Object clone() {
// set values return new indexURLEntry(this.toPropertyForm());
this.urlHash = pr.getProperty("h", ""); }
this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A"));
this.wordcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("w", "__")); public static int encodedStringFormLength() {
this.phrasecount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("p", "__")); // the size of the index entry attributes when encoded to string
this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__")); return 24;
this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__")); }
this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__"));
this.worddistance = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("i", "__")); public String toEncodedStringForm() {
this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A"))); // attention: this integrates NOT the URL hash into the encoding
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__")); // if you need a complete dump, use toExternalForm()
this.language = pr.getProperty("l", "uk").getBytes(); StringBuffer buf = new StringBuffer(encodedStringFormLength());
this.doctype = pr.getProperty("d", "u").charAt(0);
this.localflag = pr.getProperty("f", ""+indexEntryAttribute.LT_LOCAL).charAt(0); buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength))
} .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2))
public Object clone() { .append(new String(this.language))
return new plasmaWordIndexEntryInstance(this.toPropertyForm()); .append(this.doctype)
} .append(this.localflag)
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
public static int encodedStringFormLength() { .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
// the size of the index entry attributes when encoded to string .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
return 24; .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2))
} .append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)); // 3+3+2+2+1+1+2+2+2+2+2+2= 24 bytes
public String toEncodedStringForm() {
// attention: this integrates NOT the URL hash into the encoding return buf.toString();
// if you need a complete dump, use toExternalForm() }
StringBuffer buf = new StringBuffer(encodedStringFormLength());
public static int encodedByteArrayFormLength() {
buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength)) // the size of the index entry attributes when encoded to string
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) return encodedStringFormLength();
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2)) }
.append(new String(this.language))
.append(this.doctype) public byte[] toEncodedByteArrayForm() {
.append(this.localflag) return toEncodedStringForm().getBytes();
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) }
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2)) public String toPropertyForm() {
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2)) StringBuffer str = new StringBuffer(61);
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)); // 3+3+2+2+1+1+2+2+2+2+2+2= 24 bytes str.append("{")
.append( "h=").append(this.urlHash)
return buf.toString(); .append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength))
} .append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
.append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2))
public static int encodedByteArrayFormLength() { .append(",l=").append(new String(this.language))
// the size of the index entry attributes when encoded to string .append(",d=").append(this.doctype)
return encodedStringFormLength(); .append(",f=").append(this.localflag)
} .append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
.append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
public byte[] toEncodedByteArrayForm() { .append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
return toEncodedStringForm().getBytes(); .append(",i=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2))
} .append(",w=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2))
.append(",p=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2))
public String toPropertyForm() { .append("}");
StringBuffer str = new StringBuffer(61);
return str.toString();
str.append("{") }
.append( "h=").append(this.urlHash)
.append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, indexURL.urlQualityLength)) public static void main(String[] args) {
.append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3)) // outputs the word hash to a given word
.append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2)) if (args.length != 1) System.exit(0);
.append(",l=").append(new String(this.language)) System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0]));
.append(",d=").append(this.doctype) }
.append(",f=").append(this.localflag)
.append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2)) }
.append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
.append(",i=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2))
.append(",w=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2))
.append(",p=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2))
.append("}");
return str.toString();
}
public static void main(String[] args) {
// outputs the word hash to a given word
if (args.length != 1) System.exit(0);
System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0]));
}
}

@ -11,7 +11,7 @@ import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.index.indexURLEntry;
import de.anomic.server.serverDate; import de.anomic.server.serverDate;
public class plasmaDbImporter extends AbstractImporter implements dbImporter { public class plasmaDbImporter extends AbstractImporter implements dbImporter {
@ -128,13 +128,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// loop through the entities of the container and get the // loop through the entities of the container and get the
// urlhash // urlhash
Iterator importWordIdxEntries = newContainer.entries(); Iterator importWordIdxEntries = newContainer.entries();
plasmaWordIndexEntryInstance importWordIdxEntry; indexURLEntry importWordIdxEntry;
while (importWordIdxEntries.hasNext()) { while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted // testing if import process was aborted
if (isAborted()) break; if (isAborted()) break;
// getting next word index entry // getting next word index entry
importWordIdxEntry = (plasmaWordIndexEntryInstance) importWordIdxEntries.next(); importWordIdxEntry = (indexURLEntry) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.getUrlHash(); String urlHash = importWordIdxEntry.getUrlHash();
entityUrls.add(urlHash); entityUrls.add(urlHash);
} }

@ -67,6 +67,7 @@ import java.util.Properties;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.http.httpc.response; import de.anomic.http.httpc.response;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
@ -159,7 +160,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash); gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
} }
public Entry getEntry(String hash, plasmaWordIndexEntryInstance searchedWord) throws IOException { public Entry getEntry(String hash, indexURLEntry searchedWord) throws IOException {
return new Entry(hash, searchedWord); return new Entry(hash, searchedWord);
} }
@ -414,7 +415,7 @@ public final class plasmaCrawlLURL extends indexURL {
private int size; private int size;
private int wordCount; private int wordCount;
private String snippet; private String snippet;
private plasmaWordIndexEntryInstance word; // this is only used if the url is transported via remote search requests private indexURLEntry word; // this is only used if the url is transported via remote search requests
private boolean stored; private boolean stored;
// more needed attributes: // more needed attributes:
@ -449,7 +450,7 @@ public final class plasmaCrawlLURL extends indexURL {
this.stored = false; this.stored = false;
} }
public Entry(String urlHash, plasmaWordIndexEntryInstance searchedWord) throws IOException { public Entry(String urlHash, indexURLEntry searchedWord) throws IOException {
// generates a plasmaLURLEntry using the url hash // generates a plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered // to speed up the access, the url-hashes are buffered
// in the hash cache. // in the hash cache.
@ -510,7 +511,7 @@ public final class plasmaCrawlLURL extends indexURL {
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0")); this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", ""); this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntryInstance(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
this.stored = false; this.stored = false;
//} //}
} catch (Exception e) { } catch (Exception e) {
@ -647,7 +648,7 @@ public final class plasmaCrawlLURL extends indexURL {
return snippet; return snippet;
} }
public plasmaWordIndexEntryInstance word() { public indexURLEntry word() {
return word; return word;
} }

@ -47,6 +47,8 @@ import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
@ -72,22 +74,22 @@ public class plasmaDHTChunk {
private int status = chunkStatus_UNDEFINED; private int status = chunkStatus_UNDEFINED;
private String startPointHash; private String startPointHash;
private plasmaWordIndexEntryContainer[] indexContainers = null; private indexTreeMapContainer[] indexContainers = null;
private HashMap urlCache; // String (url-hash) / plasmaCrawlLURL.Entry private HashMap urlCache; // String (url-hash) / plasmaCrawlLURL.Entry
private int idxCount; private int idxCount;
private long selectionStartTime = 0; private long selectionStartTime = 0;
private long selectionEndTime = 0; private long selectionEndTime = 0;
public plasmaWordIndexEntryContainer firstContainer() { public indexTreeMapContainer firstContainer() {
return indexContainers[0]; return indexContainers[0];
} }
public plasmaWordIndexEntryContainer lastContainer() { public indexTreeMapContainer lastContainer() {
return indexContainers[indexContainers.length - 1]; return indexContainers[indexContainers.length - 1];
} }
public plasmaWordIndexEntryContainer[] containers() { public indexTreeMapContainer[] containers() {
return indexContainers; return indexContainers;
} }
@ -189,7 +191,7 @@ public class plasmaDHTChunk {
Iterator wordHashIterator = wordIndex.wordHashSet(hash, resourceLevel, true, maxcount).iterator(); Iterator wordHashIterator = wordIndex.wordHashSet(hash, resourceLevel, true, maxcount).iterator();
indexContainer indexContainer; indexContainer indexContainer;
Iterator urlIter; Iterator urlIter;
plasmaWordIndexEntryInstance indexEntry; indexURLEntry indexEntry;
plasmaCrawlLURL.Entry lurl; plasmaCrawlLURL.Entry lurl;
int refcount = 0; int refcount = 0;
@ -197,7 +199,7 @@ public class plasmaDHTChunk {
double maximumDistance = ((double) peerRedundancy * 2) / ((double) yacyCore.seedDB.sizeConnected()); double maximumDistance = ((double) peerRedundancy * 2) / ((double) yacyCore.seedDB.sizeConnected());
while ((maxcount > refcount) && (wordHashIterator.hasNext()) && ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0) while ((maxcount > refcount) && (wordHashIterator.hasNext()) && ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0)
&& ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntryContainer) tmpContainers.get(0)).wordHash()) < maximumDistance))) { && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(nexthash, ((indexTreeMapContainer) tmpContainers.get(0)).wordHash()) < maximumDistance))) {
// make an on-the-fly entity and insert values // make an on-the-fly entity and insert values
indexContainer = wordIndex.getContainer(nexthash, true, 10000); indexContainer = wordIndex.getContainer(nexthash, true, 10000);
int notBoundCounter = 0; int notBoundCounter = 0;
@ -205,7 +207,7 @@ public class plasmaDHTChunk {
urlIter = indexContainer.entries(); urlIter = indexContainer.entries();
// iterate over indexes to fetch url entries and store them in the urlCache // iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (maxcount > refcount)) { while ((urlIter.hasNext()) && (maxcount > refcount)) {
indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); indexEntry = (indexURLEntry) urlIter.next();
try { try {
lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry); lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) { if ((lurl == null) || (lurl.url() == null)) {
@ -225,7 +227,7 @@ public class plasmaDHTChunk {
// remove all remaining; we have enough // remove all remaining; we have enough
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); indexEntry = (indexURLEntry) urlIter.next();
urlIter.remove(); urlIter.remove();
} }
@ -238,7 +240,7 @@ public class plasmaDHTChunk {
} }
} }
// create result // create result
indexContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]); indexContainers = (indexTreeMapContainer[]) tmpContainers.toArray(new indexTreeMapContainer[tmpContainers.size()]);
if ((indexContainers == null) || (indexContainers.length == 0)) { if ((indexContainers == null) || (indexContainers.length == 0)) {
log.logFine("No index available for index transfer, hash start-point " + startPointHash); log.logFine("No index available for index transfer, hash start-point " + startPointHash);
@ -251,13 +253,13 @@ public class plasmaDHTChunk {
return refcount; return refcount;
} catch (kelondroException e) { } catch (kelondroException e) {
log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e); log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
indexContainers = new plasmaWordIndexEntryContainer[0]; indexContainers = new indexTreeMapContainer[0];
urlCache = new HashMap(); urlCache = new HashMap();
this.status = chunkStatus_FAILED; this.status = chunkStatus_FAILED;
return 0; return 0;
} catch (IOException e) { } catch (IOException e) {
log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e); log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
indexContainers = new plasmaWordIndexEntryContainer[0]; indexContainers = new indexTreeMapContainer[0];
urlCache = new HashMap(); urlCache = new HashMap();
this.status = chunkStatus_FAILED; this.status = chunkStatus_FAILED;
return 0; return 0;
@ -267,7 +269,7 @@ public class plasmaDHTChunk {
public int deleteTransferIndexes() { public int deleteTransferIndexes() {
Iterator urlIter; Iterator urlIter;
plasmaWordIndexEntryInstance indexEntry; indexURLEntry indexEntry;
String[] urlHashes; String[] urlHashes;
int count = 0; int count = 0;
for (int i = 0; i < this.indexContainers.length; i++) { for (int i = 0; i < this.indexContainers.length; i++) {
@ -276,7 +278,7 @@ public class plasmaDHTChunk {
urlHashes = new String[this.indexContainers[i].size()]; urlHashes = new String[this.indexContainers[i].size()];
urlIter = this.indexContainers[i].entries(); urlIter = this.indexContainers[i].entries();
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntryInstance) urlIter.next(); indexEntry = (indexURLEntry) urlIter.next();
urlHashes[c++] = indexEntry.getUrlHash(); urlHashes[c++] = indexEntry.getUrlHash();
} }
count += wordIndex.removeEntries(this.indexContainers[i].wordHash(), urlHashes, true); count += wordIndex.removeEntries(this.indexContainers[i].wordHash(), urlHashes, true);

@ -52,6 +52,8 @@ import de.anomic.server.logging.serverLog;
import de.anomic.server.serverInstantThread; import de.anomic.server.serverInstantThread;
import de.anomic.yacy.yacySearch; import de.anomic.yacy.yacySearch;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
public final class plasmaSearchEvent extends Thread implements Runnable { public final class plasmaSearchEvent extends Thread implements Runnable {
@ -84,8 +86,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.ranking = ranking; this.ranking = ranking;
this.urlStore = urlStore; this.urlStore = urlStore;
this.snippetCache = snippetCache; this.snippetCache = snippetCache;
this.rcLocal = new plasmaWordIndexEntryContainer(null); this.rcLocal = new indexTreeMapContainer(null);
this.rcGlobal = new plasmaWordIndexEntryContainer(null); this.rcGlobal = new indexTreeMapContainer(null);
this.rcGlobalCount = 0; this.rcGlobalCount = 0;
this.profileLocal = localTiming; this.profileLocal = localTiming;
this.profileGlobal = remoteTiming; this.profileGlobal = remoteTiming;
@ -176,13 +178,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// since this is a conjunction we return an empty entity if any word // since this is a conjunction we return an empty entity if any word
// is not known // is not known
if (containers == null) { if (containers == null) {
rcLocal = new plasmaWordIndexEntryContainer(null); rcLocal = new indexTreeMapContainer(null);
return 0; return 0;
} }
// join the result // join the result
profileLocal.startTimer(); profileLocal.startTimer();
rcLocal = plasmaWordIndexEntryContainer.joinContainer(containers, rcLocal = indexTreeMapContainer.joinContainer(containers,
profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_JOIN), profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_JOIN),
query.maxDistance); query.maxDistance);
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_JOIN); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_JOIN);
@ -218,7 +220,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// we collect the urlhashes and construct a list with urlEntry objects // we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime // attention: if minEntries is too high, this method will not terminate within the maxTime
plasmaWordIndexEntryContainer searchResult = new plasmaWordIndexEntryContainer(null); indexTreeMapContainer searchResult = new indexTreeMapContainer(null);
long preorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_PRESORT); long preorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
profileLocal.startTimer(); profileLocal.startTimer();
@ -240,7 +242,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty //if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
//if (searchResult.size() == 0) return acc; // case that we have nothing to do //if (searchResult.size() == 0) return acc; // case that we have nothing to do
plasmaWordIndexEntryInstance entry; indexURLEntry entry;
plasmaCrawlLURL.Entry page; plasmaCrawlLURL.Entry page;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try { try {

@ -49,6 +49,8 @@ import java.util.Iterator;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBinSearch; import de.anomic.kelondro.kelondroBinSearch;
public final class plasmaSearchPreOrder { public final class plasmaSearchPreOrder {
@ -56,7 +58,7 @@ public final class plasmaSearchPreOrder {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true; private static boolean useYBR = true;
private plasmaWordIndexEntryInstance entryMin, entryMax; private indexURLEntry entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query; private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking; private plasmaSearchRankingProfile ranking;
@ -116,36 +118,36 @@ public final class plasmaSearchPreOrder {
return pageAcc.size() > 0; return pageAcc.size() > 0;
} }
public plasmaWordIndexEntryInstance next() { public indexURLEntry next() {
Object top = pageAcc.lastKey(); Object top = pageAcc.lastKey();
return (plasmaWordIndexEntryInstance) pageAcc.remove(top); return (indexURLEntry) pageAcc.remove(top);
} }
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) { public void addContainer(indexTreeMapContainer container, long maxTime) {
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
plasmaWordIndexEntryInstance indexEntry; indexURLEntry indexEntry;
// first pass: find min/max to obtain limits for normalization // first pass: find min/max to obtain limits for normalization
Iterator i = container.entries(); Iterator i = container.entries();
int count = 0; int count = 0;
while (i.hasNext()) { while (i.hasNext()) {
if (System.currentTimeMillis() > limitTime) break; if (System.currentTimeMillis() > limitTime) break;
indexEntry = (plasmaWordIndexEntryInstance) i.next(); indexEntry = (indexURLEntry) i.next();
if (entryMin == null) entryMin = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMin.min(indexEntry); if (entryMin == null) entryMin = (indexURLEntry) indexEntry.clone(); else entryMin.min(indexEntry);
if (entryMax == null) entryMax = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMax.max(indexEntry); if (entryMax == null) entryMax = (indexURLEntry) indexEntry.clone(); else entryMax.max(indexEntry);
count++; count++;
} }
// second pass: normalize entries and get ranking // second pass: normalize entries and get ranking
i = container.entries(); i = container.entries();
for (int j = 0; j < count; j++) { for (int j = 0; j < count; j++) {
indexEntry = (plasmaWordIndexEntryInstance) i.next(); indexEntry = (indexURLEntry) i.next();
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry); pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry);
} }
} }
public plasmaWordIndexEntryInstance[] getNormalizer() { public indexURLEntry[] getNormalizer() {
return new plasmaWordIndexEntryInstance[] {entryMin, entryMax}; return new indexURLEntry[] {entryMin, entryMax};
} }
public static int ybr_p(String urlHash) { public static int ybr_p(String urlHash) {

@ -47,6 +47,7 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import de.anomic.index.indexEntry; import de.anomic.index.indexEntry;
import de.anomic.index.indexURLEntry;
public class plasmaSearchRankingProfile { public class plasmaSearchRankingProfile {
@ -165,8 +166,8 @@ public class plasmaSearchRankingProfile {
public long preRanking(indexEntry entry) { public long preRanking(indexEntry entry) {
long ranking = 0; long ranking = 0;
if (entry instanceof plasmaWordIndexEntryInstance) { if (entry instanceof indexURLEntry) {
plasmaWordIndexEntryInstance normalizedEntry = (plasmaWordIndexEntryInstance) entry; indexURLEntry normalizedEntry = (indexURLEntry) entry;
ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue(); ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue(); ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue(); ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue();

@ -57,10 +57,11 @@ import de.anomic.server.serverCodings;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
public final class plasmaSearchResult { public final class plasmaSearchResult {
private plasmaWordIndexEntryInstance entryMin, entryMax; private indexURLEntry entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@ -107,11 +108,11 @@ public final class plasmaSearchResult {
return (plasmaCrawlLURL.Entry) pageAcc.remove(top); return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
} }
protected void addResult(plasmaWordIndexEntryInstance indexEntry, plasmaCrawlLURL.Entry page) { protected void addResult(indexURLEntry indexEntry, plasmaCrawlLURL.Entry page) {
// make min/max for normalization // make min/max for normalization
if (entryMin == null) entryMin = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMin.min(indexEntry); if (entryMin == null) entryMin = (indexURLEntry) indexEntry.clone(); else entryMin.min(indexEntry);
if (entryMax == null) entryMax = (plasmaWordIndexEntryInstance) indexEntry.clone(); else entryMax.max(indexEntry); if (entryMax == null) entryMax = (indexURLEntry) indexEntry.clone(); else entryMax.max(indexEntry);
// take out relevant information for reference computation // take out relevant information for reference computation
URL url = page.url(); URL url = page.url();
@ -139,13 +140,13 @@ public final class plasmaSearchResult {
for (int i = 0; i < references.length; i++) commonSense.add(references[i]); for (int i = 0; i < references.length; i++) commonSense.add(references[i]);
Object[] resultVector; Object[] resultVector;
plasmaWordIndexEntryInstance indexEntry; indexURLEntry indexEntry;
plasmaCrawlLURL.Entry page; plasmaCrawlLURL.Entry page;
long ranking; long ranking;
for (int i = 0; i < results.size(); i++) { for (int i = 0; i < results.size(); i++) {
// take out values from result array // take out values from result array
resultVector = (Object[]) results.get(i); resultVector = (Object[]) results.get(i);
indexEntry = (plasmaWordIndexEntryInstance) resultVector[0]; indexEntry = (indexURLEntry) resultVector[0];
page = (plasmaCrawlLURL.Entry) resultVector[1]; page = (plasmaCrawlLURL.Entry) resultVector[1];
// calculate ranking // calculate ranking

@ -131,7 +131,9 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
@ -1473,8 +1475,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey(); String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = indexEntryAttribute.word2hash(word); String wordHash = indexEntryAttribute.word2hash(word);
plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash); indexTreeMapContainer wordIdxContainer = new indexTreeMapContainer(wordHash);
plasmaWordIndexEntryInstance wordIdxEntry = new plasmaWordIndexEntryInstance(urlHash, indexURLEntry wordIdxEntry = new indexURLEntry(urlHash,
urlLength, urlComps, urlLength, urlComps,
wordStat.count, wordStat.count,
document.longTitle.length(), document.longTitle.length(),
@ -1503,7 +1505,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// transfering the index to the storage peer // transfering the index to the storage peer
String error = yacyClient.transferIndex( String error = yacyClient.transferIndex(
seed, seed,
(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]), (indexTreeMapContainer[])tmpContainers.toArray(new indexTreeMapContainer[tmpContainers.size()]),
urlCache, urlCache,
true, true,
120000); 120000);

@ -61,8 +61,11 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexRAMCacheRI;
import de.anomic.index.indexRI; import de.anomic.index.indexRI;
import de.anomic.index.indexAbstractRI; import de.anomic.index.indexAbstractRI;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMergeIterator; import de.anomic.kelondro.kelondroMergeIterator;
@ -76,7 +79,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
private static final int assortmentCount = 64; private static final int assortmentCount = 64;
private final File databaseRoot; private final File databaseRoot;
private final plasmaWordIndexCache ramCache; private final indexRAMCacheRI ramCache;
private final plasmaWordIndexAssortmentCluster assortmentCluster; private final plasmaWordIndexAssortmentCluster assortmentCluster;
private int assortmentBufferSize; //kb private int assortmentBufferSize; //kb
private final plasmaWordIndexClassicDB backend; private final plasmaWordIndexClassicDB backend;
@ -85,7 +88,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) { public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) {
this.databaseRoot = databaseRoot; this.databaseRoot = databaseRoot;
this.backend = new plasmaWordIndexClassicDB(databaseRoot, log); this.backend = new plasmaWordIndexClassicDB(databaseRoot, log);
this.ramCache = new plasmaWordIndexCache(databaseRoot, log); this.ramCache = new indexRAMCacheRI(databaseRoot, log);
// create new assortment cluster path // create new assortment cluster path
File assortmentClusterPath = new File(databaseRoot, indexAssortmentClusterPath); File assortmentClusterPath = new File(databaseRoot, indexAssortmentClusterPath);
@ -149,7 +152,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
public void flushControl() { public void flushControl() {
// check for forced flush // check for forced flush
synchronized (this) { ramCache.shiftK2W(); } synchronized (this) { ramCache.shiftK2W(); }
while (ramCache.maxURLinWCache() > plasmaWordIndexCache.wCacheReferenceLimit) { while (ramCache.maxURLinWCache() > indexRAMCacheRI.wCacheReferenceLimit) {
flushCache(1); flushCache(1);
} }
if (ramCache.wSize() > ramCache.getMaxWordCount()) { if (ramCache.wSize() > ramCache.getMaxWordCount()) {
@ -242,7 +245,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
Iterator i = condenser.words(); Iterator i = condenser.words();
Map.Entry wentry; Map.Entry wentry;
String word; String word;
plasmaWordIndexEntryInstance ientry; indexURLEntry ientry;
plasmaCondenser.wordStatProp wprop; plasmaCondenser.wordStatProp wprop;
String wordHash; String wordHash;
int urlLength = url.toString().length(); int urlLength = url.toString().length();
@ -254,7 +257,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = indexEntryAttribute.word2hash(word); wordHash = indexEntryAttribute.word2hash(word);
ientry = new plasmaWordIndexEntryInstance(urlHash, ientry = new indexURLEntry(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(), urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(),
wprop.count, wprop.count,
condenser.RESULT_SIMI_WORDS, condenser.RESULT_SIMI_WORDS,
@ -281,7 +284,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); indexTreeMapContainer container = new indexTreeMapContainer(wordHash);
// get from cache // get from cache
// We must not use the container from cache to store everything we find, // We must not use the container from cache to store everything we find,
// as that container remains linked to in the cache and might be changed later // as that container remains linked to in the cache and might be changed later
@ -359,7 +362,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
public synchronized indexContainer deleteContainer(String wordHash) { public synchronized indexContainer deleteContainer(String wordHash) {
indexContainer c = ramCache.deleteContainer(wordHash); indexContainer c = ramCache.deleteContainer(wordHash);
if (c == null) c = new plasmaWordIndexEntryContainer(wordHash); if (c == null) c = new indexTreeMapContainer(wordHash);
c.add(assortmentCluster.deleteContainer(wordHash, -1), -1); c.add(assortmentCluster.deleteContainer(wordHash, -1), -1);
c.add(backend.deleteContainer(wordHash), -1); c.add(backend.deleteContainer(wordHash), -1);
return c; return c;
@ -518,11 +521,11 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
// the combined container will fit, read the container // the combined container will fit, read the container
try { try {
Iterator entries = entity.elements(true); Iterator entries = entity.elements(true);
plasmaWordIndexEntryInstance entry; indexURLEntry entry;
while (entries.hasNext()) { while (entries.hasNext()) {
entry = (plasmaWordIndexEntryInstance) entries.next(); entry = (indexURLEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash()); // System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new plasmaWordIndexEntryInstance[]{entry}, System.currentTimeMillis()); container.add(new indexURLEntry[]{entry}, System.currentTimeMillis());
} }
// we have read all elements, now delete the entity // we have read all elements, now delete the entity
entity.deleteComplete(); entity.deleteComplete();
@ -570,7 +573,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
String wordHash = ""; String wordHash = "";
indexContainer wordContainer = null; indexContainer wordContainer = null;
plasmaWordIndexEntryInstance entry = null; indexURLEntry entry = null;
URL url = null; URL url = null;
HashSet urlHashs = new HashSet(); HashSet urlHashs = new HashSet();
try { try {
@ -583,7 +586,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
wordHashNow = wordHash; wordHashNow = wordHash;
while (containerIterator.hasNext() && run) { while (containerIterator.hasNext() && run) {
waiter(); waiter();
entry = (plasmaWordIndexEntryInstance) containerIterator.next(); entry = (indexURLEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: // System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash()); // "+entry.getUrlHash());
try { try {

@ -58,6 +58,8 @@ import java.util.Iterator;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
@ -72,7 +74,7 @@ public final class plasmaWordIndexAssortment {
4, // occurrence counter 4, // occurrence counter
8, // timestamp of last access 8, // timestamp of last access
indexEntryAttribute.urlHashLength, // corresponding URL hash indexEntryAttribute.urlHashLength, // corresponding URL hash
plasmaWordIndexEntryInstance.encodedStringFormLength() // URL attributes indexURLEntry.encodedStringFormLength() // URL attributes
}; };
// class variables // class variables
@ -136,9 +138,9 @@ public final class plasmaWordIndexAssortment {
row[1] = kelondroRecords.long2bytes(1, 4); row[1] = kelondroRecords.long2bytes(1, 4);
row[2] = kelondroRecords.long2bytes(newContainer.updated(), 8); row[2] = kelondroRecords.long2bytes(newContainer.updated(), 8);
Iterator entries = newContainer.entries(); Iterator entries = newContainer.entries();
plasmaWordIndexEntryInstance entry; indexURLEntry entry;
for (int i = 0; i < assortmentLength; i++) { for (int i = 0; i < assortmentLength; i++) {
entry = (plasmaWordIndexEntryInstance) entries.next(); entry = (indexURLEntry) entries.next();
row[3 + 2 * i] = entry.getUrlHash().getBytes(); row[3 + 2 * i] = entry.getUrlHash().getBytes();
row[4 + 2 * i] = entry.toEncodedStringForm().getBytes(); row[4 + 2 * i] = entry.toEncodedStringForm().getBytes();
} }
@ -215,10 +217,10 @@ public final class plasmaWordIndexAssortment {
public indexContainer row2container(String wordHash, byte[][] row) { public indexContainer row2container(String wordHash, byte[][] row) {
if (row == null) return null; if (row == null) return null;
final long updateTime = kelondroRecords.bytes2long(row[2]); final long updateTime = kelondroRecords.bytes2long(row[2]);
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); indexTreeMapContainer container = new indexTreeMapContainer(wordHash);
for (int i = 0; i < assortmentLength; i++) { for (int i = 0; i < assortmentLength; i++) {
container.add( container.add(
new plasmaWordIndexEntryInstance[] { new plasmaWordIndexEntryInstance( new indexURLEntry[] { new indexURLEntry(
new String(row[3 + 2 * i]), new String(row[4 + 2 * i])) }, updateTime); new String(row[3 + 2 * i]), new String(row[4 + 2 * i])) }, updateTime);
} }
return container; return container;

@ -54,6 +54,8 @@ import java.util.Iterator;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexRI; import de.anomic.index.indexRI;
import de.anomic.index.indexAbstractRI; import de.anomic.index.indexAbstractRI;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroObjectCache; import de.anomic.kelondro.kelondroObjectCache;
import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRecords;
@ -144,13 +146,13 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl
int clusterStart = clusterCount - (int) (Math.random() * (clusterCount - clusterMinStart)); int clusterStart = clusterCount - (int) (Math.random() * (clusterCount - clusterMinStart));
// do the insert // do the insert
plasmaWordIndexEntryContainer c; indexTreeMapContainer c;
Iterator i = newContainer.entries(); Iterator i = newContainer.entries();
for (int j = clusterStart; j >= 1; j--) { for (int j = clusterStart; j >= 1; j--) {
c = new plasmaWordIndexEntryContainer(newContainer.wordHash()); c = new indexTreeMapContainer(newContainer.wordHash());
for (int k = 0; k < j; k++) { for (int k = 0; k < j; k++) {
if (i.hasNext()) { if (i.hasNext()) {
c.add((plasmaWordIndexEntryInstance) i.next(), newContainer.updated()); c.add((indexURLEntry) i.next(), newContainer.updated());
} else { } else {
storeForced(c); storeForced(c);
return; return;
@ -186,14 +188,14 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl
} }
if (need == 0) { if (need == 0) {
// we found spaces so that we can put in the newContainer into these spaces // we found spaces so that we can put in the newContainer into these spaces
plasmaWordIndexEntryContainer c; indexTreeMapContainer c;
Iterator i = newContainer.entries(); Iterator i = newContainer.entries();
for (int j = testsize - 1; j >= 0; j--) { for (int j = testsize - 1; j >= 0; j--) {
if (spaces[j] == 0) continue; if (spaces[j] == 0) continue;
c = new plasmaWordIndexEntryContainer(newContainer.wordHash()); c = new indexTreeMapContainer(newContainer.wordHash());
for (int k = 0; k <= j; k++) { for (int k = 0; k <= j; k++) {
assert (i.hasNext()); assert (i.hasNext());
c.add((plasmaWordIndexEntryInstance) i.next(), newContainer.updated()); c.add((indexURLEntry) i.next(), newContainer.updated());
} }
storeForced(c); storeForced(c);
} }
@ -216,7 +218,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl
public indexContainer deleteContainer(String wordHash, long maxTime) { public indexContainer deleteContainer(String wordHash, long maxTime) {
// removes all records from all the assortments and return them // removes all records from all the assortments and return them
indexContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); indexContainer buffer, record = new indexTreeMapContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
long remainingTime; long remainingTime;
for (int i = 0; i < clusterCount; i++) { for (int i = 0; i < clusterCount; i++) {
@ -240,7 +242,7 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl
public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
// collect all records from all the assortments and return them // collect all records from all the assortments and return them
indexContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); indexContainer buffer, record = new indexTreeMapContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
long remainingTime; long remainingTime;
for (int i = 0; i < clusterCount; i++) { for (int i = 0; i < clusterCount; i++) {

@ -52,6 +52,8 @@ import java.util.TreeSet;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexRI; import de.anomic.index.indexRI;
import de.anomic.index.indexAbstractRI; import de.anomic.index.indexAbstractRI;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
@ -193,16 +195,16 @@ public class plasmaWordIndexClassicDB extends indexAbstractRI implements indexRI
if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute
if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) { if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) {
plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10); plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); indexTreeMapContainer container = new indexTreeMapContainer(wordHash);
plasmaWordIndexEntryInstance entry; indexURLEntry entry;
Iterator i = entity.elements(true); Iterator i = entity.elements(true);
while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) { while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
entry = (plasmaWordIndexEntryInstance) i.next(); entry = (indexURLEntry) i.next();
container.add(entry); container.add(entry);
} }
return container; return container;
} else { } else {
return new plasmaWordIndexEntryContainer(wordHash); return new indexTreeMapContainer(wordHash);
} }
} }
@ -217,7 +219,7 @@ public class plasmaWordIndexClassicDB extends indexAbstractRI implements indexRI
public indexContainer deleteContainer(String wordHash) { public indexContainer deleteContainer(String wordHash) {
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash); plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
return new plasmaWordIndexEntryContainer(wordHash); return new indexTreeMapContainer(wordHash);
} }
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {

@ -50,6 +50,7 @@ import java.util.Iterator;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
@ -93,10 +94,10 @@ public final class plasmaWordIndexEntity {
kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent); kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent);
} catch (IOException e) { } catch (IOException e) {
theLocation.delete(); theLocation.delete();
kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, plasmaWordIndexEntryInstance.encodedStringFormLength(), false); kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, indexURLEntry.encodedStringFormLength(), false);
} else { } else {
// create new index file // create new index file
kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, plasmaWordIndexEntryInstance.encodedStringFormLength(), false); kt = new kelondroTree(theLocation, cacheSize, kelondroTree.defaultObjectCachePercent, indexURL.urlHashLength, indexURLEntry.encodedStringFormLength(), false);
} }
return kt; // everyone who get this should close it when finished! return kt; // everyone who get this should close it when finished!
} }
@ -135,23 +136,23 @@ public final class plasmaWordIndexEntity {
} catch (IOException e) {} } catch (IOException e) {}
} }
public plasmaWordIndexEntryInstance getEntry(String urlhash) throws IOException { public indexURLEntry getEntry(String urlhash) throws IOException {
byte[][] n = theIndex.get(urlhash.getBytes()); byte[][] n = theIndex.get(urlhash.getBytes());
if (n == null) return null; if (n == null) return null;
return new plasmaWordIndexEntryInstance(new String(n[0]), new String(n[1])); return new indexURLEntry(new String(n[0]), new String(n[1]));
} }
public boolean contains(String urlhash) throws IOException { public boolean contains(String urlhash) throws IOException {
return (theIndex.get(urlhash.getBytes()) != null); return (theIndex.get(urlhash.getBytes()) != null);
} }
public boolean contains(plasmaWordIndexEntryInstance entry) throws IOException { public boolean contains(indexURLEntry entry) throws IOException {
return (theIndex.get(entry.getUrlHash().getBytes()) != null); return (theIndex.get(entry.getUrlHash().getBytes()) != null);
} }
public boolean addEntry(plasmaWordIndexEntryInstance entry) throws IOException { public boolean addEntry(indexURLEntry entry) throws IOException {
if (entry == null) return false; if (entry == null) return false;
plasmaWordIndexEntryInstance oldEntry = getEntry(entry.getUrlHash()); indexURLEntry oldEntry = getEntry(entry.getUrlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
return false; return false;
} }
@ -170,7 +171,7 @@ public final class plasmaWordIndexEntity {
if (container != null) { if (container != null) {
Iterator i = container.entries(); Iterator i = container.entries();
while (i.hasNext()) { while (i.hasNext()) {
if (addEntry((plasmaWordIndexEntryInstance) i.next())) count++; if (addEntry((indexURLEntry) i.next())) count++;
} }
} }
@ -235,7 +236,7 @@ public final class plasmaWordIndexEntity {
public Object next() { public Object next() {
if (i == null) return null; if (i == null) return null;
byte[][] n = (byte[][]) i.next(); byte[][] n = (byte[][]) i.next();
return new plasmaWordIndexEntryInstance(new String(n[0]), new String(n[1])); return new indexURLEntry(new String(n[0]), new String(n[1]));
} }
public void remove() { public void remove() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
@ -255,7 +256,7 @@ public final class plasmaWordIndexEntity {
long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time; long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time;
try { try {
while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) { while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) {
addEntry((plasmaWordIndexEntryInstance) i.next()); addEntry((indexURLEntry) i.next());
} }
} catch (kelondroException e) { } catch (kelondroException e) {
serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage()); serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage());

@ -55,13 +55,13 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexTreeMapContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntryInstance;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
@ -467,9 +467,9 @@ public final class yacyClient {
// create containers // create containers
final int words = wordhashes.length() / indexEntryAttribute.wordHashLength; final int words = wordhashes.length() / indexEntryAttribute.wordHashLength;
plasmaWordIndexEntryContainer[] container = new plasmaWordIndexEntryContainer[words]; indexTreeMapContainer[] container = new indexTreeMapContainer[words];
for (int i = 0; i < words; i++) { for (int i = 0; i < words; i++) {
container[i] = new plasmaWordIndexEntryContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); container[i] = new indexTreeMapContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength));
} }
// insert results to containers // insert results to containers
@ -484,10 +484,10 @@ public final class yacyClient {
urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry // save the url entry
final plasmaWordIndexEntryInstance entry; final indexURLEntry entry;
if (urlEntry.word() == null) { if (urlEntry.word() == null) {
// the old way to define words // the old way to define words
entry = new plasmaWordIndexEntryInstance( entry = new indexURLEntry(
urlEntry.hash(), urlEntry.hash(),
urlLength, urlComps, urlLength, urlComps,
urlEntry.descr().length(), urlEntry.descr().length(),
@ -514,7 +514,7 @@ public final class yacyClient {
} }
// add the url entry to the word indexes // add the url entry to the word indexes
for (int m = 0; m < words; m++) { for (int m = 0; m < words; m++) {
container[m].add(new plasmaWordIndexEntryInstance[]{entry}, System.currentTimeMillis()); container[m].add(new indexURLEntry[]{entry}, System.currentTimeMillis());
} }
} }
@ -882,11 +882,11 @@ public final class yacyClient {
// check if we got all necessary urls in the urlCache (only for debugging) // check if we got all necessary urls in the urlCache (only for debugging)
Iterator eenum; Iterator eenum;
plasmaWordIndexEntryInstance entry; indexURLEntry entry;
for (int i = 0; i < indexes.length; i++) { for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries(); eenum = indexes[i].entries();
while (eenum.hasNext()) { while (eenum.hasNext()) {
entry = (plasmaWordIndexEntryInstance) eenum.next(); entry = (indexURLEntry) eenum.next();
if (urlCache.get(entry.getUrlHash()) == null) { if (urlCache.get(entry.getUrlHash()) == null) {
yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache"); yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache");
} }
@ -962,11 +962,11 @@ public final class yacyClient {
int indexcount = 0; int indexcount = 0;
final StringBuffer entrypost = new StringBuffer(indexes.length*73); final StringBuffer entrypost = new StringBuffer(indexes.length*73);
Iterator eenum; Iterator eenum;
plasmaWordIndexEntryInstance entry; indexURLEntry entry;
for (int i = 0; i < indexes.length; i++) { for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries(); eenum = indexes[i].entries();
while (eenum.hasNext()) { while (eenum.hasNext()) {
entry = (plasmaWordIndexEntryInstance) eenum.next(); entry = (indexURLEntry) eenum.next();
entrypost.append(indexes[i].wordHash()) entrypost.append(indexes[i].wordHash())
.append(entry.toPropertyForm()) .append(entry.toPropertyForm())
.append(serverCore.crlfString); .append(serverCore.crlfString);

@ -85,7 +85,7 @@ import de.anomic.plasma.plasmaWordIndexAssortment;
import de.anomic.plasma.plasmaWordIndexAssortmentCluster; import de.anomic.plasma.plasmaWordIndexAssortmentCluster;
import de.anomic.plasma.plasmaWordIndexClassicDB; import de.anomic.plasma.plasmaWordIndexClassicDB;
import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntryInstance; import de.anomic.index.indexURLEntry;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.server.serverDate; import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
@ -858,7 +858,7 @@ public final class yacy {
// the combined container will fit, read the container // the combined container will fit, read the container
Iterator importWordIdxEntries = newContainer.entries(); Iterator importWordIdxEntries = newContainer.entries();
plasmaWordIndexEntryInstance importWordIdxEntry; indexURLEntry importWordIdxEntry;
while (importWordIdxEntries.hasNext()) { while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted // testing if import process was aborted
@ -866,7 +866,7 @@ public final class yacy {
// getting next word index entry // getting next word index entry
entryCounter++; entryCounter++;
importWordIdxEntry = (plasmaWordIndexEntryInstance) importWordIdxEntries.next(); importWordIdxEntry = (indexURLEntry) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.getUrlHash(); String urlHash = importWordIdxEntry.getUrlHash();
if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try { if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try {
// importing the new url // importing the new url
@ -969,9 +969,9 @@ public final class yacy {
// the combined container will fit, read the container // the combined container will fit, read the container
Iterator wordIdxEntries = wordIdxContainer.entries(); Iterator wordIdxEntries = wordIdxContainer.entries();
plasmaWordIndexEntryInstance wordIdxEntry; indexURLEntry wordIdxEntry;
while (wordIdxEntries.hasNext()) { while (wordIdxEntries.hasNext()) {
wordIdxEntry = (plasmaWordIndexEntryInstance) wordIdxEntries.next(); wordIdxEntry = (indexURLEntry) wordIdxEntries.next();
String urlHash = wordIdxEntry.getUrlHash(); String urlHash = wordIdxEntry.getUrlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null); plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);

Loading…
Cancel
Save