git-svn-id: 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
@ -0,0 +1,41 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "">
<title>YaCy: Press Material, Publications, Presentations</title>
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
<!-- <meta name="Content-Language" content="German, Deutsch, de, at, ch"> -->
<meta name="Content-Language" content="English, Englisch">
<meta name="keywords" content="YaCy HTTP Proxy search engine spider indexer java network open free download Mac Windows Software development">
<meta name="description" content="YaCy Software HTTP Proxy Freeware Home Page">
<meta name="copyright" content="Michael Christen">
<script src="navigation.js" type="text/javascript"></script>
<link rel="stylesheet" media="all" href="style.css">
<!-- Realisation: Michael Christen; Contact: mc<at>>
<body bgcolor="#fefefe" marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
<SCRIPT LANGUAGE="JavaScript1.1"><!--
<!-- ----- HERE STARTS CONTENT PART ----- -->
<h2>Press Material, Publications, Presentations</h2>
<p>Here you can find links to documents that have been published about YaCy by the YaCy authors.</p><br>
<p>Deutsche Dokumentation / German-only documents
<li><a href=""><b>"YaCy -- Peer-to-Peer Web-Suchmaschine"</b></a> - Veröffentlichung in der Datenschleuder #086; technische Details zur Funktionsweise</li>
<li><a href=""><b>Vortrag zur SuMa-eV Veranstaltung: "Portale/Suchmaschinen - und ihre Grenzen"</b> - pdf/präsentierfertige Folien</a></li>
<li><a href=""><b>Vortrag zur SuMa-eV Veranstaltung: "Portale/Suchmaschinen - und ihre Grenzen"</b> - Web-Präsentation</a></li>
<li><a href=""><b>Flyer "Das Wichtigste zu YaCy im Überblick"</b></a></li>
<!-- ----- HERE ENDS CONTENT PART ----- -->
<SCRIPT LANGUAGE="JavaScript1.1"><!--
@ -0,0 +1,380 @@
// -------------------------
// part of YACY
// (C) by Michael Peter Christen;
// first published on
// Frankfurt, Germany, 2005
// last major change: 6.5.2005
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.util.*;
import java.lang.RuntimeException;
import de.anomic.kelondro.*;
import de.anomic.server.serverLog;
import de.anomic.yacy.yacySeedDB;
/**
 * In-memory cache for word index entries that is placed in front of another
 * plasmaWordIndexInterface implementation (the backend). Containers are
 * collected per word hash and handed to the backend when they are flushed.
 */
public class plasmaWordIndexCache implements plasmaWordIndexInterface {
// name of the dump file; dump() and restore() resolve it against databaseRoot
private static final String indexDumpFileName = "indexDump.stack";
// boundary key strings, filled once by the static initializer
static String minKey, maxKey;
// class variables
// directory that holds the dump file
private File databaseRoot;
// index that receives the containers written by flushKey()
private plasmaWordIndexInterface backend;
// maps a word hash to its plasmaWordIndexEntryContainer
private TreeMap cache;
// per word hash score; addEntries() adds the number of newly cached entries
private kelondroMScoreCluster hashScore;
// maps a word hash to its creation time (stored as Long)
private HashMap hashDate;
// flushToLimit() keeps flushing while hashScore.size() >= maxWords
private int maxWords;
private serverLog log;
// Builds the two boundary keys, each yacySeedDB.commonHashLength characters long:
// maxKey consists only of 'z' characters, minKey only of '-' characters.
static {
maxKey = "";
for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += 'z';
minKey = "";
// append to minKey here: the earlier code appended the '-' characters to maxKey,
// which left minKey empty and made maxKey twice as long as a hash
for (int i = 0; i < yacySeedDB.commonHashLength; i++) minKey += '-';
/**
 * Creates an empty cache in front of the given backend. The word limit
 * starts at 10000 and can be changed with setMaxWords().
 *
 * @param databaseRoot directory used for the cache dump file
 * @param backend index that receives flushed containers
 * @param log logger for status and error messages
 */
public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, serverLog log) {
this.databaseRoot = databaseRoot;
this.cache = new TreeMap();
this.hashScore = new kelondroMScoreCluster();
this.hashDate = new HashMap();
this.maxWords = 10000;
this.backend = backend;
this.log = log;
// NOTE(review): the body of this try block is not visible here; judging from the
// error message it presumably calls restore() - confirm
try {
} catch (IOException e){
log.logError("unable to restore cache dump: " + e.getMessage());
/**
 * Writes the cache content to the dump file below databaseRoot. An existing
 * dump file is deleted first. Each dumped row has four columns: the word hash,
 * the container size (4 bytes), the creation time (8 bytes) and the encoded
 * word index entry.
 *
 * @param waitingSeconds not used by the statements visible in this block
 * @throws IOException declared for failures of the dump stack
 */
private void dump(int waitingSeconds) throws IOException {
log.logSystem("creating dump for index cache, " + cache.size() + " words (and much more urls)");
File indexDumpFile = new File(databaseRoot, indexDumpFileName);
if (indexDumpFile.exists()) indexDumpFile.delete();
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0, new int[]{plasmaWordIndexEntry.wordHashLength, 4, 8, plasmaWordIndexEntry.attrSpaceLong});
long startTime = System.currentTimeMillis();
// a progress message is written at most every 5 seconds
long messageTime = System.currentTimeMillis() + 5000;
long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
synchronized (cache) {
Iterator i = cache.entrySet().iterator();
Map.Entry entry;
String wordHash;
plasmaWordIndexEntryContainer container;
long creationTime;
plasmaWordIndexEntry wordEntry;
byte[][] row = new byte[4][];
while (i.hasNext()) {
// get entries
entry = (Map.Entry);
wordHash = (String) entry.getKey();
creationTime = getCreationTime(wordHash);
container = (plasmaWordIndexEntryContainer) entry.getValue();
// put entries on stack
if (container != null) {
Iterator ci = container.entries();
while (ci.hasNext()) {
wordEntry = (plasmaWordIndexEntry);
row[0] = wordHash.getBytes();
row[1] = kelondroRecords.long2bytes(container.size(), 4);
row[2] = kelondroRecords.long2bytes(creationTime, 8);
row[3] = wordEntry.toEncodedForm(true).getBytes();
// write a log
if (System.currentTimeMillis() > messageTime) {
wordsPerSecond = wordcount * 1000 / (1 + System.currentTimeMillis() - startTime);
// the integer rate can be 0 when only few words were dumped so far;
// clamp the divisor to 1 so the estimate cannot divide by zero
log.logInfo("dumping status: " + wordcount + " words done, " + ((cache.size() - wordcount) / Math.max(1, wordsPerSecond)) + " seconds remaining");
messageTime = System.currentTimeMillis() + 5000;
log.logSystem("dumped " + urlcount + " word/url relations in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
/**
 * Reads the dump file below databaseRoot, if there is one, and adds every
 * stored word/url relation to the cache through addEntry().
 *
 * @return 0 when no dump file exists, otherwise the value of urlCount
 * @throws IOException declared for failures of the dump stack
 */
private long restore() throws IOException {
File indexDumpFile = new File(databaseRoot, indexDumpFileName);
if (!(indexDumpFile.exists())) return 0;
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0);
log.logSystem("restore dump of index cache, " + dumpStack.size() + " word/url relations");
long startTime = System.currentTimeMillis();
// a progress message is written at most every 5 seconds
long messageTime = System.currentTimeMillis() + 5000;
long urlCount = 0, urlsPerSecond = 0;
synchronized (cache) {
Iterator i = dumpStack.iterator();
kelondroRecords.Node node;
String wordHash;
plasmaWordIndexEntryContainer container;
long creationTime;
plasmaWordIndexEntry wordEntry;
byte[][] row = new byte[4][];
while (i.hasNext()) {
// get out one entry
node = (kelondroRecords.Node);
row = node.getValues();
// row[0] is the word hash, row[2] the creation time, row[3] the encoded entry
wordHash = new String(row[0]);
creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new plasmaWordIndexEntry(wordHash, new String(row[3]));
// store to cache
addEntry(wordHash, wordEntry, creationTime);
// write a log
if (System.currentTimeMillis() > messageTime) {
urlsPerSecond = urlCount * 1000 / (1 + System.currentTimeMillis() - startTime);
// the integer rate can be 0 when only few urls were restored so far;
// clamp the divisor to 1 so the estimate cannot divide by zero
log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpStack.size() - urlCount) / Math.max(1, urlsPerSecond)) + " seconds remaining");
messageTime = System.currentTimeMillis() + 5000;
log.logSystem("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
return urlCount;
// Returns the score that hashScore reports for its maximum object.
public int maxURLinWordCache() {
return hashScore.getScore(hashScore.getMaxObject());
// Returns the number of word hashes currently held in the cache map.
public int wordCacheRAMSize() {
return cache.size();
// Sets the limit that flushToLimit() compares hashScore.size() against.
public void setMaxWords(int maxWords) {
this.maxWords = maxWords;
// Returns the larger of backend.size() and cache.size().
public int size() {
if (backend.size() > cache.size()) return backend.size(); else return cache.size();
/**
 * Returns an iterator that combines the word hashes of the cache with those
 * of the backend.
 *
 * @param startWordHash passed on to the backend iterator
 * @param up must be true; any other value raises a RuntimeException
 */
public Iterator wordHashes(String startWordHash, boolean up) {
if (!(up)) throw new RuntimeException("plasmaWordIndexCache.wordHashes can only count up");
// NOTE(review): the cache iterator starts at the first cached key and does not
// take startWordHash into account, unlike the backend iterator - confirm intent
return new iterateCombined(cache.keySet().iterator(), backend.wordHashes(startWordHash, true), true);
/**
 * Iterator over the String elements of two other iterators. The pending
 * element of each source is kept in na / nb; next() returns one of them
 * depending on the result of the string comparison.
 * NOTE(review): several statements of this class are incomplete in this
 * view (the fetch in nexta()/nextb(), the comparison in next(), and the
 * places where the pending elements are advanced) - verify against the
 * complete file.
 */
public class iterateCombined implements Iterator {
// comparator obtained from kelondroMSetTools.fastStringComparator(up)
Comparator comp;
// the two source iterators
Iterator a, b;
// pending element of a and of b; null when that source is exhausted
String na, nb;
boolean up;
public iterateCombined(Iterator a, Iterator b, boolean up) {
this.a = a;
this.b = b;
this.up = up;
this.comp = kelondroMSetTools.fastStringComparator(up);
// loads the next element of a into na, or null if a has no more elements
private void nexta() {
if (a.hasNext()) na = (String); else na = null;
// loads the next element of b into nb, or null if b has no more elements
private void nextb() {
if (b.hasNext()) nb = (String); else nb = null;
// true as long as at least one source still has a pending element
public boolean hasNext() {
return (na != null) || (nb != null);
public Object next() {
String s;
// only b has a pending element
if (na == null) {
s = nb;
return s;
// only a has a pending element
if (nb == null) {
s = na;
return s;
// compare the strings
int c =, nb);
if (c == 0) {
// both sources hold the same string; it is returned once
s = na;
//System.out.println("Iterate Hash: take " + s + " from file&cache");
return s;
} else if ((up) && (c < 0)) {
s = na;
return s;
} else {
s = nb;
return s;
// no statements of remove() are visible in this view
public void remove() {
/**
 * Hands the cached container of one word hash to the backend.
 *
 * @param key the word hash to flush
 * @return 0 if the cache holds no container for the key, otherwise the
 *         result of backend.addEntries()
 */
private int flushKey(String key) {
plasmaWordIndexEntryContainer container = null;
long time;
synchronized (cache) {
container = (plasmaWordIndexEntryContainer) cache.get(key);
if (container == null) return 0; // flushing of nonexisting key
time = getCreationTime(key);
// NOTE(review): removal of the key from cache, hashScore and hashDate is not
// visible in this view - confirm that it happens before the backend call
return backend.addEntries(container, time);
/**
 * Flushes cached words to the backend while hashScore holds at least
 * maxWords elements. The word with the maximum score is chosen each round,
 * and the loop stops once more than 100 entries have been flushed in total.
 *
 * @return the sum of the flushKey() results; 0 if the cache is empty or if
 *         hashScore and cache disagree about being empty
 */
private int flushToLimit() {
if ((hashScore.size() == 0) && (cache.size() == 0)) {
serverLog.logDebug("PLASMA INDEXING", "flushToLimit: called but cache is empty");
return 0;
// the next two cases report an inconsistency between hashScore and cache
if ((hashScore.size() == 0) && (cache.size() != 0)) {
serverLog.logError("PLASMA INDEXING", "flushToLimit: hashScore.size=0 but cache.size=" + cache.size());
return 0;
if ((hashScore.size() != 0) && (cache.size() == 0)) {
serverLog.logError("PLASMA INDEXING", "flushToLimit: hashScore.size=" + hashScore.size() + " but cache.size=0");
return 0;
//serverLog.logDebug("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + ", cache.size=" + cache.size());
int total = 0;
synchronized (hashScore) {
String key;
int count;
Long createTime;
while (hashScore.size() >= maxWords) {
key = (String) hashScore.getMaxObject();
createTime = (Long) hashDate.get(key);
count = hashScore.getScore(key);
// a key younger than 9000 ms is reported as too fresh
// NOTE(review): the statement that leaves the loop in this case is not visible here
if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) {
log.logDebug("key " + key + " is too fresh, abandon flush (count=" + count + ", cachesize=" + cache.size() + ")");
if (count < 5) log.logWarning("flushing of key " + key + " not appropriate (too less entries, count=" + count + "): increase cache size");
log.logDebug("flushing key " + key + ", count=" + count + ", cachesize=" + cache.size());
total += flushKey(key);
if (total > 100) break;
return total;
// Returns the index entity of the backend for the given word hash.
// NOTE(review): no flush of cached entries for wordHash is visible before the
// delegation - confirm against the complete file.
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) {
return backend.getIndex(wordHash, deleteIfEmpty);
// Returns the creation time stored in hashDate for the word hash,
// or 0 if hashDate has no entry for it.
public long getCreationTime(String wordHash) {
Long time = (Long) hashDate.get(wordHash);
if (time == null) return 0;
return time.longValue();
// Works under the lock of the cache map.
// NOTE(review): the statements inside the synchronized block are not visible in this view.
public void deleteIndex(String wordHash) {
synchronized (cache) {
// Delegates the removal to the backend and returns its result.
// NOTE(review): no flush of cached entries for wordHash is visible before the
// delegation - confirm against the complete file.
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
return backend.removeEntries(wordHash, urlHashes, deleteComplete);
/**
 * Merges the given container into the cached container of the same word hash.
 * When at least one entry is new, the cache map, the score of the word hash
 * and its creation time are updated.
 *
 * @param container entries to add; its wordHash() selects the cache slot
 * @param creationTime time stored in hashDate for the word hash
 * @return the number of entries that were new to the cache
 */
public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
//if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries");
// put new words into cache
int added = 0;
synchronized (cache) {
String wordHash = container.wordHash();
plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash);
added = entries.add(container);
if (added > 0) {
cache.put(wordHash, entries);
hashScore.addScore(wordHash, added);
hashDate.put(wordHash, new Long(creationTime));
//System.out.println("DEBUG: cache = " + cache.toString());
return added;
// Adds a single entry to the cached container of the word hash; restore() uses
// this for every row of the dump. The cache map and hashDate are only updated
// when the entry was new to the container.
// NOTE(review): unlike addEntries(), no hashScore update is visible here and the
// method does not synchronize on cache itself (restore() holds that lock).
private void addEntry(String wordHash, plasmaWordIndexEntry newEntry, long creationTime) {
plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash);
if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash);
if (entries.add(newEntry)) {
cache.put(wordHash, entries);
hashDate.put(wordHash, new Long(creationTime));
// Closes the cache; an IOException raised in the try block is logged, not rethrown.
// NOTE(review): the body of the try block is not visible here; judging from the
// error message it presumably calls dump(waitingSeconds) - confirm
public void close(int waitingSeconds) {
try {
} catch (IOException e){
log.logError("unable to dump cache: " + e.getMessage());
@ -0,0 +1,255 @@
// -----------------------------
// part of YACY
// (C) by Michael Peter Christen;
// first published on
// Frankfurt, Germany, 2005
// last major change: 6.5.2005
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.util.*;
import de.anomic.kelondro.*;
import de.anomic.server.serverLog;
import de.anomic.yacy.yacySeedDB;
/**
 * plasmaWordIndexInterface implementation that works directly on the word
 * index files below a database root directory, using plasmaWordIndexEntity
 * for every access.
 */
public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
// class variables
// root directory of the index files
private File databaseRoot;
private serverLog log;
// value reported by size(); set to 0 in the constructor
private int size;
/**
 * Stores the database root and the logger and sets the size counter to 0.
 *
 * @param databaseRoot root directory of the index files
 * @param log logger for error messages
 */
public plasmaWordIndexClassicDB(File databaseRoot, serverLog log) throws IOException {
this.databaseRoot = databaseRoot;
this.log = log;
this.size = 0;
// Returns the size field.
// NOTE(review): none of the visible statements of this class changes the field after
// the constructor sets it to 0 - confirm whether it is maintained elsewhere.
public int size() {
return size;
// Returns a new iterateFiles iterator for the given start hash and direction.
public Iterator wordHashes(String startHash, boolean up) {
return new iterateFiles(startHash, up);
/**
 * Iterator over word hashes that are derived from the file names found in
 * the directory tree below databaseRoot/WORDS. One element is always
 * prefetched into buffer; hasNext() tests it and next() returns it.
 * NOTE(review): several expressions of this class are incomplete in this
 * view (the comparator calls in the constructor, delete() and next(), and
 * parts of next0()) - verify against the complete file.
 */
public class iterateFiles implements Iterator {
private ArrayList hierarchy; // contains TreeSet elements, each TreeSet contains File Entries
private Comparator comp; // for string-compare
private String buffer; // the prefetch-buffer
public iterateFiles(String startHash, boolean up) {
this.hierarchy = new ArrayList();
this.comp = kelondroMSetTools.fastStringComparator(up);
// then we initially fill the hierarchy with the content of the root folder
String path = "WORDS";
TreeSet list = list(new File(databaseRoot, path));
// if we have a start hash then we find the appropriate subdirectory to start
if ((startHash != null) && (startHash.length() == yacySeedDB.commonHashLength)) {
delete(startHash.substring(0, 1), list);
if (list.size() > 0) {
// path elements are taken from the hash characters 0, 1, 2-3 and 4-5
String[] paths = new String[]{startHash.substring(0, 1), startHash.substring(1, 2), startHash.substring(2, 4), startHash.substring(4, 6)};
int pathc = 0;
while ((pathc < paths.length) &&
( list.first(), paths[pathc]) == 0)) {
path = path + "/" + paths[pathc];
list = list(new File(databaseRoot, path));
delete(paths[pathc], list);
if (list.size() == 0) break;
while (((buffer = next0()) != null) && (, startHash) < 0)) {};
} else {
buffer = next0();
// removes names from the front of the set while the loop condition holds for them
private synchronized void delete(String pattern, TreeSet names) {
String name;
while ((names.size() > 0) && ( File(name = (String) names.first())).getName(), pattern) < 0)) names.remove(name);
// returns the entries of a directory as a sorted set of path strings;
// the set is empty if path.list() returns null
private TreeSet list(File path) {
//System.out.println("PATH: " + path);
TreeSet t = new TreeSet(comp);
String[] l = path.list();
if (l != null) for (int i = 0; i < l.length; i++) t.add(path + "/" + l[i]);
//else System.out.println("DEBUG: wrong path " + path);
return t;
// computes the next word hash, or null when nothing is left
private synchronized String next0() {
// the object is a File pointing to the corresponding file
File f;
String n;
TreeSet t;
do {
t = null;
// take the deepest set of the hierarchy and drop sets that are empty
while ((t == null) && (hierarchy.size() > 0)) {
t = (TreeSet) hierarchy.get(hierarchy.size() - 1);
if (t.size() == 0) {
hierarchy.remove(hierarchy.size() - 1); // we step up one hierarchy
t = null;
if ((hierarchy.size() == 0) || (t.size() == 0)) return null; // this is the end
// fetch value
f = new File(n = (String) t.first());
// if the value represents another folder, we step into the next hierarchy
if (f.isDirectory()) {
t = list(f);
if (t.size() == 0) {
// the folder is empty, delete it
} else {
f = null;
} while (f == null);
// thats it
// the hash is the first commonHashLength characters of the file name
if ((f == null) || ((n = f.getName()) == null) || (n.length() < yacySeedDB.commonHashLength)) {
return null;
} else {
return n.substring(0, yacySeedDB.commonHashLength);
public boolean hasNext() {
return buffer != null;
// returns the prefetched element and prefetches the following one
public Object next() {
String r = buffer;
while (((buffer = next0()) != null) && (, r) < 0)) {};
return r;
// no statements of remove() are visible in this view
public void remove() {
// Opens the index entity for the word hash below databaseRoot.
// An IOException is logged and answered with null, so callers must expect null.
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) {
try {
return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
} catch (IOException e) {
log.logError("plasmaWordIndexClassic.getIndex: " + e.getMessage());
return null;
// Returns the last-modified time of the file that plasmaWordIndexEntity.wordHash2path()
// yields for the word hash, or -1 if that file does not exist.
public long getCreationTime(String wordHash) {
File f = plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash);
if (f.exists()) return f.lastModified(); else return -1;
// Removes the index of the word hash through plasmaWordIndexEntity.removePlasmaIndex();
// an IOException is logged and not rethrown.
public void deleteIndex(String wordHash) {
try {
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
} catch (IOException e) {
log.logError("plasmaWordIndexClassic.deleteIndex: " + e.getMessage());
/**
 * Removes the given url hashes from the index of one word.
 *
 * @param wordHash the word whose index is changed
 * @param urlHashes the url hashes to remove
 * @param deleteComplete passed on to removeEntry(); when true, an index that
 *        is empty afterwards is deleted as well
 * @return the number of removed entries; 0 if the index could not be opened,
 *         and the count reached so far if an IOException occurs
 */
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
// removes all given url hashes from a single word index. Returns number of deletions.
plasmaWordIndexEntity pi = getIndex(wordHash, true);
// getIndex() answers an I/O failure with null (after logging it); without this
// guard the loop below would fail with a NullPointerException
if (pi == null) return 0;
int count = 0;
try {
for (int i = 0; i < urlHashes.length; i++)
if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
// local size of this one index; it hides the size field of the class
int size = pi.size();
pi.close(); pi = null;
// check if we can remove the index completely
if ((deleteComplete) && (size == 0)) deleteIndex(wordHash);
return count;
} catch (IOException e) {
log.logError("plasmaWordIndexClassic.removeEntries: " + e.getMessage());
return count;
/**
 * Writes the entries of a container into the index of the container's word hash.
 *
 * @param container the entries to write
 * @param creationTime not used by the statements visible in this block
 * @return the number of entries counted as added; 0 for an empty container
 *         or after an IOException
 */
public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
//System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
// fetch the index cache
if (container.size() == 0) return 0;
// open file
try {
plasmaWordIndexEntity pi = new plasmaWordIndexEntity(databaseRoot, container.wordHash(), false);
int count = 0;
// write from vector
// NOTE(review): this null check comes after container was already dereferenced above
if (container != null) {
Iterator i = container.entries();
while (i.hasNext()) {
if (pi.addEntry((plasmaWordIndexEntry) count++;
// close and return
// NOTE(review): no pi.close() call is visible before the reference is dropped - confirm
pi = null;
return count;
} catch (IOException e) {
log.logError("plasmaWordIndexClassic.addEntries: " + e.getMessage());
return 0;
// Part of plasmaWordIndexInterface; no statements of this method are visible in this view.
public void close(int waitingSeconds) {
@ -0,0 +1,97 @@
// ------------------------------
// part of YaCy
// (C) by Michael Peter Christen;
// first published on
// Frankfurt, Germany, 2005
// last major change: 07.05.2005
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.util.*;
/**
 * Collection of plasmaWordIndexEntry objects that belong to one word hash.
 * The entries are kept in a map keyed by their url hash, so each url hash
 * occurs at most once.
 */
public class plasmaWordIndexEntryContainer {
// the word hash this container belongs to
private String wordHash;
// url hash -> plasmaWordIndexEntry
private HashMap container;
// creates an empty container for the given word hash
public plasmaWordIndexEntryContainer(String wordHash) {
this.wordHash = wordHash;
container = new HashMap(); // a urlhash/plasmaWordIndexEntry - relation
// number of entries in this container
public int size() {
return container.size();
public String wordHash() {
return wordHash;
public boolean add(plasmaWordIndexEntry entry) {
// returns true if the new entry was added, false if it already existed
String urlHash = entry.getUrlHash();
// an existing entry with the same url hash is kept, not replaced
if (container.containsKey(urlHash)) return false;
container.put(urlHash, entry);
return true;
public int add(plasmaWordIndexEntryContainer c) {
// returns the number of new elements
Iterator i = c.entries();
int x = 0;
while (i.hasNext()) {
if (add((plasmaWordIndexEntry) x++;
return x;
public Iterator entries() {
// returns an iterator of plasmaWordIndexEntry objects
return container.values().iterator();
// creates a new container for the word hash and returns it
// NOTE(review): the statement that adds the given entry to the new container is not
// visible in this view - confirm that it exists in the complete file
public static plasmaWordIndexEntryContainer instantContainer(String wordHash, plasmaWordIndexEntry entry) {
plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash);
return c;
public String toString() {
return "C[" + wordHash + "] has " + container.size() + " entries";
@ -1,275 +0,0 @@
// -----------------------------
// part of YACY
// (C) by Michael Peter Christen;
// first published on
// Frankfurt, Germany, 2004
// last major change: 22.01.2004
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
The plasmaIndexCache manages a database table with a list of
indexEntries in it. This is done in a completely different fashion
than in the plasmaIndex tables: the entries are not
sorted and are just stored in a buffer.
Whenever an index is retrieved during a search, its buffer
is first flushed into the corresponding index table, so that it can be
sorted into the remaining index entry elements.
The cache database consists of
- the word hash as primary key
- one column with a one-byte counter
- a number of more columns with indexEntry elements
// compile with
// javac -classpath classes -sourcepath source -d classes -g source/de/anomic/plasma/*.java
package de.anomic.plasma;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Vector;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.serverLog;
public class plasmaWordIndexFileCache {
private static final String indexCacheFileName = "indexCache.db";
private static final int buffers = 50; // number of buffered entries per word
// class variables
private File databaseRoot;
private kelondroTree indexCache;
private int bufferkb;
public plasmaWordIndexFileCache(File databaseRoot, int bufferkb) throws IOException {
this.databaseRoot = databaseRoot;
this.bufferkb = bufferkb;
File indexCacheFile = new File(databaseRoot, indexCacheFileName);
if (indexCacheFile.exists()) {
// simply open the file
indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400);
} else {
private void resetCacheFile() {
// this has to be used in emergencies only
// it can happen that there is a serious db inconsistency; in that case we re-create the indexCache
try { indexCache.close(); } catch (IOException e) {}
File indexCacheFile = new File(databaseRoot, indexCacheFileName);
if (indexCacheFile.exists()) indexCacheFile.delete();
try {
} catch (IOException e) {
de.anomic.server.serverLog.logError("PLASMA", "plasmaWordIndexFileCache.resetCacheFile: serious failure creating the cache file: " + e.getMessage());
indexCache = null;
private void createCacheFile(File indexCacheFile) throws IOException {
// create a new file
int[] columns = new int[buffers + 2];
columns[0] = plasmaWordIndexEntry.wordHashLength;
columns[1] = 1;
for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort;
indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns);
protected void close() throws IOException {
indexCache = null;
private byte[][] getCache(String wordHash) throws IOException {
// read one line from the cache; if none exists: construct one
byte[][] row;
try {
row = indexCache.get(wordHash.getBytes());
} catch (Exception e) {
// we had some negativeSeekOffsetExceptions here, and also loops may cause this
// in that case the indexCache is corrupt
System.out.println("Error in plasmaWordINdexFileCache.getCache: index for hash " + wordHash + " is corrupt:" + e.toString());
row = null;
if (row == null) {
row = new byte[indexCache.columns()][];
row[0] = wordHash.getBytes();
row[1] = new byte[1];
row[1][0] = (byte) 0;
return row;
protected Iterator wordHashes(String wordHash, boolean up) throws IOException {
try {
return indexCache.rows(up, false, (wordHash == null) ? null : wordHash.getBytes());
} catch (kelondroException e) {
de.anomic.server.serverLog.logError("PLASMA", "kelondro error in plasmaWordIndexFileCache: " + e.getMessage() + "; deleting index for " + wordHash);
return new HashSet().iterator();
protected plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) throws IOException {
// first flush the index cache, if there is any for that word hash
byte[][] row = indexCache.get(wordHash.getBytes());
if (row != null) {
int entries = (int) row[1][0];
if (entries != 0) flushCache(row, null); // if the cache has entries, flush it
indexCache.remove(wordHash.getBytes()); // delete the cache index row; suppose to be empty now
// then return the index from the uncached file (with new entries)
return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
protected void addEntriesToIndex(String wordHash, Vector /* of plasmaIndexEntry */ newEntries) throws IOException {
//System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
// fetch the index cache
if (newEntries.size() == 0) return;
byte[][] row = getCache(wordHash);
int entries = (int) row[1][0];
// check if the index cache is full
if (entries + 2 + newEntries.size() >= indexCache.columns()) {
flushCache(row, newEntries); // and put in new values
entries = 0;
row[1][0] = (byte) 0; // set number of entries to zero
} else {
// put in the new values
String newEntry;
for (int i = 0; i < newEntries.size(); i++) {
newEntry = ((plasmaWordIndexEntry) newEntries.elementAt(i)).getUrlHash() + ((plasmaWordIndexEntry) newEntries.elementAt(i)).toEncodedForm(false);
row[entries + 2] = newEntry.getBytes();
row[1][0] = (byte) entries;
try {
} catch (kelondroException e) {
// this is a very bad case; a database inconsistency occurred
serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted");
} catch (IOException e) {
// this is a very bad case; a database inconsistency occurred
serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted");
// finished!
protected void deleteComplete(String wordHash) throws IOException {
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
// Removes the given url hashes from the index of a single word.
//
// wordHash       - hash of the word whose index is modified
// urlHashes      - url hashes to remove; each one is passed to pi.removeEntry()
// deleteComplete - passed through to removeEntry(); when true and the index ends up
//                  empty, the whole index is removed via removePlasmaIndex()
// returns the number of removeEntry() calls that reported success
// throws IOException - declared on the signature
protected int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) throws IOException {
// removes all given url hashes from a single word index. Returns number of deletions.
plasmaWordIndexEntity pi = getIndex(wordHash, true);
int count = 0;
for (int i = 0; i < urlHashes.length; i++) if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
// remember the remaining size before the index is closed
// NOTE(review): pi is not closed if removeEntry() throws (no try/finally)
int size = pi.size();
pi.close(); pi = null;
// check if we can remove the index completely
if ((deleteComplete) && (size == 0)) {
// remove index
if (!(plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash)))
System.out.println("DEBUG: cannot remove index file for word hash " + wordHash);
// remove cache
// NOTE(review): no statement follows the "remove cache" comment in this listing
return count;
private synchronized void flushCache(byte[][] row, Vector indexEntries) throws IOException {
String wordHash = new String(row[0]);
int entries = (int) row[1][0];
if ((entries == 0) && ((indexEntries == null) || (indexEntries.size() == 0))) return;
// open file
plasmaWordIndexEntity pi = new plasmaWordIndexEntity(databaseRoot, wordHash, false);
// write from array
plasmaWordIndexEntry entry;
for (int i = 0; i < entries; i++) {
entry = new plasmaWordIndexEntry(new String(row[i + 2], 0, plasmaCrawlLURL.urlHashLength),
new String(row[i + 2], plasmaCrawlLURL.urlHashLength, plasmaWordIndexEntry.attrSpaceShort));
// write from vector
if (indexEntries != null) {
for (int i = 0; i < indexEntries.size(); i++)
pi.addEntry((plasmaWordIndexEntry) indexEntries.elementAt(i));
// close and return
pi = null;
// Returns the number of entries cached for one word, read from the counter cell
// (row[1][0]) of the word's cache row; 0 when the cache has no row for the word.
//
// wordHash - hash of the word to look up; converted with getBytes(), i.e. with the
//            platform default charset
// throws IOException - declared on the signature
private int size(String wordHash) throws IOException {
// return number of entries in specific cache
byte[][] row = indexCache.get(wordHash.getBytes());
if (row == null) return 0;
// NOTE(review): signed byte widened to int - a stored count above 127 would come
// back negative; confirm the row width rules that out
return (int) row[1][0];
// Returns indexCache.size(), or 0 while indexCache is null.
protected int size() {
if (indexCache == null) return 0; else return indexCache.size();
// Creates a new plasmaIndex object for the given word hash below databaseRoot.
//
// wordHash - hash of the word whose index is requested
// throws IOException - declared on the signature
// NOTE(review): whether the constructor opens or creates files is not visible here
private plasmaIndex getIndexF(String wordHash) throws IOException {
return new plasmaIndex(databaseRoot, wordHash);
private void addEntryToIndexF(String wordHash, plasmaIndexEntry entry) throws IOException {
plasmaIndex pi = new plasmaIndex(databaseRoot, wordHash);
@ -0,0 +1,62 @@
// -----------------------------
// part of YACY
// (C) by Michael Peter Christen;
// first published on
// Frankfurt, Germany, 2005
// last major change: 6.5.2005
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.util.*;
// Common set of operations on a word index, addressed by word hash.
// The descriptions below are derived from the method and parameter names only;
// the binding semantics are defined by the implementing classes, which are not
// visible here - confirm against them.
public interface plasmaWordIndexInterface {
// presumably the number of words held by the index - TODO confirm
public int size();
// iterator over word hashes beginning at startWordHash; 'up' presumably selects
// ascending order - TODO confirm
public Iterator wordHashes(String startWordHash, boolean up);
// returns the index entity for one word; deleteIfEmpty presumably removes an
// empty index on access - TODO confirm
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty);
// creation time associated with the word's index; unit not visible here
public long getCreationTime(String wordHash);
// deletes the index of one word
public void deleteIndex(String wordHash);
// removes the given url hashes from one word's index and returns an int,
// presumably the number of removed entries - TODO confirm
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete);
// adds a container of entries; meaning of the int result is not visible here
public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime);
// shuts the index down; waitingSeconds is presumably an upper bound for the
// time spent on shutdown work - TODO confirm
public void close(int waitingSeconds);
@ -1,253 +0,0 @@
// -----------------------
// part of YACY
// (C) by Michael Peter Christen;
// first published on
// Frankfurt, Germany, 2004
// last major change: 22.12.2004
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// compile with
// javac -classpath classes -sourcepath source -d classes -g source/de/anomic/plasma/*.java
package de.anomic.plasma;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Vector;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverLog;
import de.anomic.yacy.yacySeedDB;
// In-memory word cache in front of a plasmaWordIndexFileCache. The class is a Thread;
// its run() method contains the background flush loop.
public class plasmaWordIndexRAMCache extends Thread {
// lower and upper bound used for cache.subMap() in wordHashesMem()
static String minKey, maxKey;
// class variables
// word hash (String) -> Vector, as shown by the casts in flushKey()/addEntryToIndexMem()
TreeMap cache;
// score per word hash; its max/min object selects the key flushed by flushSpecific()
kelondroMScoreCluster hashScore;
// file cache that receives flushed entries and serves all read requests
plasmaWordIndexFileCache pic;
// set to true by close() (and by run() on an IOException) to end the flush loop
boolean terminate;
// System.currentTimeMillis() deadline for the final flush, set by close()
long terminateUntil;
// addEntryToIndexMem() flushes while hashScore.size() exceeds this value
int maxWords;
static {
maxKey = "";
for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += 'z';
minKey = "";
for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-';
// Creates the RAM cache on top of a new plasmaWordIndexFileCache.
//
// databaseRoot - passed to the plasmaWordIndexFileCache constructor
// bufferkb     - passed to the plasmaWordIndexFileCache constructor
// throws IOException - declared on the signature
//
// The cache starts empty, with a word limit of 1000 (see setMaxWords) and the
// terminate flag cleared. terminateUntil keeps its default value until close() is called.
// NOTE(review): File and IOException are not among the imports visible in this listing.
public plasmaWordIndexRAMCache(File databaseRoot, int bufferkb) throws IOException {
this.pic = new plasmaWordIndexFileCache(databaseRoot, bufferkb);
this.cache = new TreeMap();
this.hashScore = new kelondroMScoreCluster();
this.maxWords = 1000;
this.terminate = false;
// Returns the score of the object that hashScore reports as its maximum.
// Judging by the method name this is the largest number of URLs cached for a single
// word - TODO confirm against the code that maintains hashScore.
public int maxURLinWordCache() {
return hashScore.getScore(hashScore.getMaxObject());
// Returns the number of keys (word hashes) currently held in the RAM cache map.
public int wordCacheRAMSize() {
return cache.size();
// Sets the word limit; addEntryToIndexMem() flushes while hashScore.size() exceeds it,
// and run() uses it to compute the pause between flushes.
public void setMaxWords(int maxWords) {
this.maxWords = maxWords;
// Thread body: loops until the terminate flag is set, then performs a time-limited
// final phase, logs the outcome and drops the cache structures.
//
// NOTE(review): this listing looks incomplete. No flush call is visible in either loop
// although IOException is caught, closing braces are absent, and as listed the
// "words lost" error is logged unconditionally after the success message. Confirm
// against the repository version.
public void run() {
serverLog.logSystem("PLASMA INDEXING", "started word cache management");
int check;
// permanently flush cache elements
while (!(terminate)) {
// with fewer than 100 scored words, wait 10 seconds before the next round
if (hashScore.size() < 100) try {Thread.currentThread().sleep(10000);} catch (InterruptedException e) {}
while ((!(terminate)) && (cache != null) && (hashScore.size() > 0)) try {
// size before this round; only used by the commented-out debug line below
check = hashScore.size();
//serverLog.logDebug("PLASMA INDEXING", "single flush. bevore=" + check + "; after=" + hashScore.size());
// pause between rounds: 10 ms plus a share of maxWords that shrinks as more
// words are scored
try {Thread.currentThread().sleep(10 + ((maxWords / 10) / (1 + hashScore.size())));} catch (InterruptedException e) {}
} catch (IOException e) {
// an I/O failure ends the loop and drops the in-memory cache
serverLog.logError("PLASMA INDEXING", "PANIK! exception in main cache loop: " + e.getMessage());
terminate = true;
cache = null;
serverLog.logSystem("PLASMA INDEXING", "CATCHED TERMINATION SIGNAL: start final flush");
// close all;
try {
// first flush everything
// bounded by the deadline that close() stores in terminateUntil
while ((hashScore.size() > 0) && (System.currentTimeMillis() < terminateUntil)) {
// then close file cache:
} catch (IOException e) {
serverLog.logDebug("PLASMA INDEXING", "interrupted final flush: " + e.toString());
// report
if (hashScore.size() == 0)
serverLog.logSystem("PLASMA INDEXING", "finished final flush; flushed all words");
serverLog.logError("PLASMA INDEXING", "terminated final flush; " + hashScore.size() + " words lost");
// delete data
// close() polls cache for null to detect that this method has finished
cache = null;
hashScore = null;
public void close(int waitingBoundSeconds) {
terminate = true;
// wait until terination is done
// we can do at least 6 flushes/second
int waitingtime = 10 + (((cache == null) ? 0 : cache.size()) / 5); // seconds
if (waitingtime > waitingBoundSeconds) waitingtime = waitingBoundSeconds; // upper bound
this.terminateUntil = System.currentTimeMillis() + (waitingtime * 1000);
terminate = true;
while ((cache != null) && (waitingtime > 0)) {
serverLog.logDebug("PLASMA INDEXING", "final word flush; cache.size=" + cache.size() + "; time-out in " + waitingtime + " seconds");
try {Thread.currentThread().sleep(5000);} catch (InterruptedException e) {}
waitingtime -= 5;
// Flushes one word, chosen through hashScore, by passing its key to flushKey().
//
// greatest - true selects hashScore.getMaxObject(), false hashScore.getMinObject()
// returns the result of flushKey(), or 0 when nothing was flushed because
//         hashScore and/or cache is empty
// throws IOException - declared on the signature; flushKey() declares it as well
private int flushSpecific(boolean greatest) throws IOException {
//System.out.println("DEBUG: plasmaIndexRAMCache.flushSpecific(" + ((greatest) ? "greatest" : "smallest") + "); cache.size() = " + cache.size());
// both structures empty: nothing to do
if ((hashScore.size() == 0) && (cache.size() == 0)) {
serverLog.logDebug("PLASMA INDEXING", "flushSpecific: called but cache is empty");
return 0;
// the two structures disagree: logged as an error, nothing is flushed
if ((hashScore.size() == 0) && (cache.size() != 0)) {
serverLog.logError("PLASMA INDEXING", "flushSpecific: hashScore.size=0 but cache.size=" + cache.size());
return 0;
if ((hashScore.size() != 0) && (cache.size() == 0)) {
serverLog.logError("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + " but cache.size=0");
return 0;
//serverLog.logDebug("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + ", cache.size=" + cache.size());
String key = (String) ((greatest) ? hashScore.getMaxObject() : hashScore.getMinObject());
return flushKey(key, "flushSpecific");
// Hands the entries cached for one word hash to the file cache.
//
// key    - word hash to flush
// caller - name of the calling method; not used by the statements visible here
// returns the number of entries in the word's Vector, or 0 when the key is not cached
// throws IOException - declared on the signature; pic.addEntriesToIndex() declares it
//
// NOTE(review): no statement that removes the key from cache or hashScore is visible
// in this listing, and the lookup happens before the synchronized block is entered.
private int flushKey(String key, String caller) throws IOException {
Vector v = null;
v = (Vector) cache.get(key);
if (v == null) return 0; // flushing of nonexisting key
synchronized (cache) {
pic.addEntriesToIndex(key, v);
return v.size();
public synchronized Iterator wordHashesMem(String wordHash, int count) throws IOException {
// returns a list of hashes from a specific start point
// we need to flush some of the elements in the cache first
// maybe we flush too much, but this is not easy to find out and it does not matter
TreeMap subMap = new TreeMap(cache.subMap((wordHash == null) ? minKey : wordHash, maxKey));
int flushcount = subMap.size();
if (flushcount > count) flushcount = count;
String key;
for (int i = 0; i < flushcount ; i++) {
key = (String) subMap.firstKey();
flushKey(key, "getSequentialWordHashesMem");
// finally return the result from the underlying hash list:
return pic.wordHashes(wordHash, true);
// Returns the index entity of a word from the file cache. The word's cached entries
// are passed to flushKey() first, so they are handed to the file cache before the read.
//
// wordHash      - hash of the requested word
// deleteIfEmpty - passed through to pic.getIndex()
// throws IOException - declared on the signature
public plasmaWordIndexEntity getIndexMem(String wordHash, boolean deleteIfEmpty) throws IOException {
flushKey(wordHash, "getIndexMem");
return pic.getIndex(wordHash, deleteIfEmpty);
// Adds one entry for a word to the RAM cache, flushing words first while the number
// of scored words exceeds maxWords.
//
// wordHash - hash of the word the entry belongs to
// entry    - the entry to cache
// returns the sum of the flushSpecific() results of the flushes done to make room
// throws IOException - declared on the signature; flushSpecific() declares it as well
//
// NOTE(review): in this listing the entry is never added to the Vector and hashScore
// is never updated; closing braces are absent too. Confirm against the repository version.
public int addEntryToIndexMem(String wordHash, plasmaWordIndexEntry entry) throws IOException {
// make space for new words
int flushc = 0;
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
synchronized (hashScore) {
// flushSpecific(true) picks the object that hashScore reports as its maximum
while (hashScore.size() > maxWords) flushc += flushSpecific(true);
//if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries");
// put new words into cache
synchronized (cache) {
Vector v = (Vector) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
// first entry for this word: start a new Vector
if (v == null) v = new Vector();
cache.put(wordHash, v);
return flushc;
// Deletes a word completely, judging by the name and the wordHash parameter.
// NOTE(review): no body is visible for this method in this listing - confirm its
// behaviour against the repository version.
public synchronized void deleteComplete(String wordHash) throws IOException {
// Removes url hashes from the index of one word. The word's cached entries are passed
// to flushKey() first; the removal itself is delegated to pic.removeEntries().
//
// wordHash       - hash of the word whose index is modified
// urlHashes      - url hashes to remove
// deleteComplete - passed through to pic.removeEntries()
// returns the result of pic.removeEntries()
// throws IOException - declared on the signature
public int removeEntriesMem(String wordHash, String[] urlHashes, boolean deleteComplete) throws IOException {
flushKey(wordHash, "removeEntriesMem");
return pic.removeEntries(wordHash, urlHashes, deleteComplete);
// Returns a lower bound for the number of words in the index: the larger of
// hashScore.size() and pic.size(), or 0 while either of the two is null.
public int sizeMin() {
// it is not easy to find out the correct size of the cache
// to make the result correct, it would be necessary to flush the complete ram cache
// instead, we return the minimum size of the cache, which is the maximum of either the
// ram or table cache
if ((hashScore == null) || (pic == null)) return 0;
return (hashScore.size() < pic.size()) ? pic.size() : hashScore.size();
@ -1,8 +1,2 @@
#plasmaParser configuration file
#plasmaParser configuration file
#Mon May 02 10:12:02 CEST 2005
#Sat May 07 22:32:33 CEST 2005
Reference in new issue