git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@86 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent 3f85978519, commit 1d7fed87dc
@@ -0,0 +1,41 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<title>YaCy: Press Material, Publications, Presentations</title>
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
<!-- <meta name="Content-Language" content="German, Deutsch, de, at, ch"> -->
<meta name="Content-Language" content="English, Englisch">
<meta name="keywords" content="YaCy HTTP Proxy search engine spider indexer java network open free download Mac Windows Software development">
<meta name="description" content="YaCy Software HTTP Proxy Freeware Home Page">
<meta name="copyright" content="Michael Christen">
<script src="navigation.js" type="text/javascript"></script>
<link rel="stylesheet" media="all" href="style.css">
<!-- Realisation: Michael Christen; Contact: mc<at>anomic.de -->
</head>
<body bgcolor="#fefefe" marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
<SCRIPT LANGUAGE="JavaScript1.1"><!--
globalheader();
//--></SCRIPT>
<!-- ----- HERE STARTS CONTENT PART ----- -->

<h2>Press Material, Publications, Presentations</h2>

<p>Here you can find links to documents about YaCy that have been published by the YaCy authors.</p><br>

<p>Deutsche Dokumentation / German-only documents
<ul>
<li><a href="http://www.yacy.net/yacy/material/YaCy-Datenschleuder086.pdf"><b>"YaCy -- Peer-to-Peer Web-Suchmaschine"</b></a> - published in Datenschleuder #086; technical details on how YaCy works</li>
<li><a href="http://www.yacy.net/yacy/material/YaCy-nichtMonopolisierbar.pdf"><b>Talk at the SuMa-eV event "Portale/Suchmaschinen - und ihre Grenzen"</b> - PDF, ready-to-present slides</a></li>
<li><a href="http://www.yacy.net/yacy/material/YaCy-nichtMonopolisierbar/index.html"><b>Talk at the SuMa-eV event "Portale/Suchmaschinen - und ihre Grenzen"</b> - web presentation</a></li>
<li><a href="http://www.yacy.net/yacy/material/YaCy-FlyerD.pdf"><b>Flyer "Das Wichtigste zu YaCy im Überblick"</b></a></li>
</ul></p><br>



<!-- ----- HERE ENDS CONTENT PART ----- -->
<SCRIPT LANGUAGE="JavaScript1.1"><!--
globalfooter();
//--></SCRIPT>
</body>
</html>
@@ -0,0 +1,380 @@
|
||||
// plasmaWordIndexCache.java
|
||||
// -------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2005
|
||||
// last major change: 6.5.2005
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.lang.RuntimeException;
|
||||
import de.anomic.kelondro.*;
|
||||
import de.anomic.server.serverLog;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
||||
public class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
|
||||
private static final String indexDumpFileName = "indexDump.stack";
|
||||
|
||||
static String minKey, maxKey;
|
||||
|
||||
// class variables
|
||||
private File databaseRoot;
|
||||
private plasmaWordIndexInterface backend;
|
||||
private TreeMap cache;
|
||||
private kelondroMScoreCluster hashScore;
|
||||
private HashMap hashDate;
|
||||
private int maxWords;
|
||||
private serverLog log;
|
||||
|
||||
static {
|
||||
maxKey = "";
|
||||
for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += 'z';
|
||||
minKey = "";
|
||||
for (int i = 0; i < yacySeedDB.commonHashLength; i++) minKey += '-';
|
||||
}
|
||||
|
||||
public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, serverLog log) {
|
||||
this.databaseRoot = databaseRoot;
|
||||
this.cache = new TreeMap();
|
||||
this.hashScore = new kelondroMScoreCluster();
|
||||
this.hashDate = new HashMap();
|
||||
this.maxWords = 10000;
|
||||
this.backend = backend;
|
||||
this.log = log;
|
||||
try {
|
||||
restore();
|
||||
} catch (IOException e){
|
||||
log.logError("unable to restore cache dump: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private void dump(int waitingSeconds) throws IOException {
|
||||
log.logSystem("creating dump for index cache, " + cache.size() + " words (and much more urls)");
|
||||
File indexDumpFile = new File(databaseRoot, indexDumpFileName);
|
||||
if (indexDumpFile.exists()) indexDumpFile.delete();
|
||||
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0, new int[]{plasmaWordIndexEntry.wordHashLength, 4, 8, plasmaWordIndexEntry.attrSpaceLong});
|
||||
long startTime = System.currentTimeMillis();
|
||||
long messageTime = System.currentTimeMillis() + 5000;
|
||||
long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
|
||||
synchronized (cache) {
|
||||
Iterator i = cache.entrySet().iterator();
|
||||
Map.Entry entry;
|
||||
String wordHash;
|
||||
plasmaWordIndexEntryContainer container;
|
||||
long creationTime;
|
||||
plasmaWordIndexEntry wordEntry;
|
||||
byte[][] row = new byte[4][];
|
||||
while (i.hasNext()) {
|
||||
// get entries
|
||||
entry = (Map.Entry) i.next();
|
||||
wordHash = (String) entry.getKey();
|
||||
creationTime = getCreationTime(wordHash);
|
||||
container = (plasmaWordIndexEntryContainer) entry.getValue();
|
||||
|
||||
// put entries on stack
|
||||
if (container != null) {
|
||||
Iterator ci = container.entries();
|
||||
while (ci.hasNext()) {
|
||||
wordEntry = (plasmaWordIndexEntry) ci.next();
|
||||
row[0] = wordHash.getBytes();
|
||||
row[1] = kelondroRecords.long2bytes(container.size(), 4);
|
||||
row[2] = kelondroRecords.long2bytes(creationTime, 8);
|
||||
row[3] = wordEntry.toEncodedForm(true).getBytes();
|
||||
dumpStack.push(row);
|
||||
urlcount++;
|
||||
}
|
||||
}
|
||||
wordcount++;
|
||||
|
||||
// write a log
|
||||
if (System.currentTimeMillis() > messageTime) {
|
||||
wordsPerSecond = wordcount * 1000 / (1 + System.currentTimeMillis() - startTime);
if (wordsPerSecond == 0) wordsPerSecond = 1; // avoid division by zero in the estimate below
log.logInfo("dumping status: " + wordcount + " words done, " + ((cache.size() - wordcount) / wordsPerSecond) + " seconds remaining");
|
||||
messageTime = System.currentTimeMillis() + 5000;
|
||||
}
|
||||
}
|
||||
}
|
||||
log.logSystem("dumped " + urlcount + " word/url relations in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
|
||||
}
|
||||
|
||||
private long restore() throws IOException {
|
||||
File indexDumpFile = new File(databaseRoot, indexDumpFileName);
|
||||
if (!(indexDumpFile.exists())) return 0;
|
||||
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0);
|
||||
log.logSystem("restore dump of index cache, " + dumpStack.size() + " word/url relations");
|
||||
long startTime = System.currentTimeMillis();
|
||||
long messageTime = System.currentTimeMillis() + 5000;
|
||||
long urlCount = 0, urlsPerSecond = 0;
|
||||
synchronized (cache) {
|
||||
Iterator i = dumpStack.iterator();
|
||||
kelondroRecords.Node node;
|
||||
String wordHash;
|
||||
plasmaWordIndexEntryContainer container;
|
||||
long creationTime;
|
||||
plasmaWordIndexEntry wordEntry;
|
||||
byte[][] row = new byte[4][];
|
||||
while (i.hasNext()) {
|
||||
// get out one entry
|
||||
node = (kelondroRecords.Node) i.next();
|
||||
row = node.getValues();
|
||||
wordHash = new String(row[0]);
|
||||
creationTime = kelondroRecords.bytes2long(row[2]);
|
||||
wordEntry = new plasmaWordIndexEntry(wordHash, new String(row[3]));
|
||||
|
||||
// store to cache
|
||||
addEntry(wordHash, wordEntry, creationTime);
|
||||
urlCount++;
|
||||
|
||||
// write a log
|
||||
if (System.currentTimeMillis() > messageTime) {
|
||||
urlsPerSecond = urlCount * 1000 / (1 + System.currentTimeMillis() - startTime);
if (urlsPerSecond == 0) urlsPerSecond = 1; // avoid division by zero in the estimate below
log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpStack.size() - urlCount) / urlsPerSecond) + " seconds remaining");
|
||||
messageTime = System.currentTimeMillis() + 5000;
|
||||
}
|
||||
}
|
||||
}
|
||||
log.logSystem("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
|
||||
return urlCount;
|
||||
}
|
||||
|
||||
public int maxURLinWordCache() {
|
||||
return hashScore.getScore(hashScore.getMaxObject());
|
||||
}
|
||||
|
||||
public int wordCacheRAMSize() {
|
||||
return cache.size();
|
||||
}
|
||||
|
||||
public void setMaxWords(int maxWords) {
|
||||
this.maxWords = maxWords;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
if (backend.size() > cache.size()) return backend.size(); else return cache.size();
|
||||
}
|
||||
|
||||
public Iterator wordHashes(String startWordHash, boolean up) {
|
||||
if (!(up)) throw new RuntimeException("plasmaWordIndexCache.wordHashes can only count up");
|
||||
return new iterateCombined(cache.keySet().iterator(), backend.wordHashes(startWordHash, true), true);
|
||||
}
|
||||
|
||||
public class iterateCombined implements Iterator {
|
||||
|
||||
Comparator comp;
|
||||
Iterator a, b;
|
||||
String na, nb;
|
||||
boolean up;
|
||||
|
||||
public iterateCombined(Iterator a, Iterator b, boolean up) {
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
this.up = up;
|
||||
this.comp = kelondroMSetTools.fastStringComparator(up);
|
||||
nexta();
|
||||
nextb();
|
||||
}
|
||||
|
||||
private void nexta() {
|
||||
if (a.hasNext()) na = (String) a.next(); else na = null;
|
||||
}
|
||||
private void nextb() {
|
||||
if (b.hasNext()) nb = (String) b.next(); else nb = null;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return (na != null) || (nb != null);
|
||||
}
|
||||
|
||||
public Object next() {
|
||||
String s;
|
||||
if (na == null) {
|
||||
s = nb;
|
||||
nextb();
|
||||
return s;
|
||||
}
|
||||
if (nb == null) {
|
||||
s = na;
|
||||
nexta();
|
||||
return s;
|
||||
}
|
||||
// compare the strings
|
||||
int c = comp.compare(na, nb);
|
||||
if (c == 0) {
|
||||
s = na;
|
||||
//System.out.println("Iterate Hash: take " + s + " from file&cache");
|
||||
nexta();
|
||||
nextb();
|
||||
return s;
|
||||
} else if ((up) && (c < 0)) {
|
||||
s = na;
|
||||
nexta();
|
||||
return s;
|
||||
} else {
|
||||
s = nb;
|
||||
nextb();
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private int flushKey(String key) {
|
||||
plasmaWordIndexEntryContainer container = null;
|
||||
long time;
|
||||
synchronized (cache) {
|
||||
container = (plasmaWordIndexEntryContainer) cache.get(key);
|
||||
if (container == null) return 0; // flushing of nonexisting key
|
||||
time = getCreationTime(key);
|
||||
cache.remove(key);
|
||||
hashScore.deleteScore(key);
|
||||
hashDate.remove(key);
|
||||
}
|
||||
return backend.addEntries(container, time);
|
||||
}
|
||||
|
||||
private int flushToLimit() {
|
||||
if ((hashScore.size() == 0) && (cache.size() == 0)) {
|
||||
serverLog.logDebug("PLASMA INDEXING", "flushToLimit: called but cache is empty");
|
||||
return 0;
|
||||
}
|
||||
if ((hashScore.size() == 0) && (cache.size() != 0)) {
|
||||
serverLog.logError("PLASMA INDEXING", "flushToLimit: hashScore.size=0 but cache.size=" + cache.size());
|
||||
return 0;
|
||||
}
|
||||
if ((hashScore.size() != 0) && (cache.size() == 0)) {
|
||||
serverLog.logError("PLASMA INDEXING", "flushToLimit: hashScore.size=" + hashScore.size() + " but cache.size=0");
|
||||
return 0;
|
||||
}
|
||||
|
||||
//serverLog.logDebug("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + ", cache.size=" + cache.size());
|
||||
int total = 0;
|
||||
synchronized (hashScore) {
|
||||
String key;
|
||||
int count;
|
||||
Long createTime;
|
||||
while (hashScore.size() >= maxWords) {
|
||||
key = (String) hashScore.getMaxObject();
|
||||
createTime = (Long) hashDate.get(key);
|
||||
count = hashScore.getScore(key);
|
||||
if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) {
|
||||
log.logDebug("key " + key + " is too fresh, abandon flush (count=" + count + ", cachesize=" + cache.size() + ")");
|
||||
break;
|
||||
}
|
||||
if (count < 5) log.logWarning("flushing of key " + key + " not appropriate (too few entries, count=" + count + "): increase cache size");
|
||||
log.logDebug("flushing key " + key + ", count=" + count + ", cachesize=" + cache.size());
|
||||
total += flushKey(key);
|
||||
if (total > 100) break;
|
||||
}
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) {
|
||||
flushKey(wordHash);
|
||||
return backend.getIndex(wordHash, deleteIfEmpty);
|
||||
}
|
||||
|
||||
public long getCreationTime(String wordHash) {
|
||||
Long time = (Long) hashDate.get(wordHash);
|
||||
if (time == null) return 0;
|
||||
return time.longValue();
|
||||
}
|
||||
|
||||
public void deleteIndex(String wordHash) {
|
||||
synchronized (cache) {
|
||||
cache.remove(wordHash);
|
||||
hashScore.deleteScore(wordHash);
|
||||
hashDate.remove(wordHash);
|
||||
}
|
||||
backend.deleteIndex(wordHash);
|
||||
}
|
||||
|
||||
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
|
||||
flushKey(wordHash);
|
||||
return backend.removeEntries(wordHash, urlHashes, deleteComplete);
|
||||
}
|
||||
|
||||
public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
|
||||
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
|
||||
flushToLimit();
|
||||
//if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries");
|
||||
|
||||
// put new words into cache
|
||||
int added = 0;
|
||||
synchronized (cache) {
|
||||
String wordHash = container.wordHash();
|
||||
plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
|
||||
if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash);
|
||||
added = entries.add(container);
|
||||
if (added > 0) {
|
||||
cache.put(wordHash, entries);
|
||||
hashScore.addScore(wordHash, added);
|
||||
hashDate.put(wordHash, new Long(creationTime));
|
||||
}
|
||||
}
|
||||
//System.out.println("DEBUG: cache = " + cache.toString());
|
||||
return added;
|
||||
}
|
||||
|
||||
private void addEntry(String wordHash, plasmaWordIndexEntry newEntry, long creationTime) {
|
||||
plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash);
|
||||
if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash);
|
||||
if (entries.add(newEntry)) {
|
||||
cache.put(wordHash, entries);
|
||||
hashScore.incScore(wordHash);
|
||||
hashDate.put(wordHash, new Long(creationTime));
|
||||
}
|
||||
}
|
||||
|
||||
public void close(int waitingSeconds) {
|
||||
try {
|
||||
dump(waitingSeconds);
|
||||
} catch (IOException e){
|
||||
log.logError("unable to dump cache: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
}
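
For orientation, a minimal usage sketch of the new two-tier index introduced above: the classic file DB acts as the persistent backend and this RAM cache sits in front of it, dumping its content to indexDump.stack on close and restoring it on the next start. The sketch is illustrative only and not part of the commit; the wrapper class and method names are invented, and the serverLog instance and the plasmaWordIndexEntry are assumed to be obtained elsewhere.

    // illustration only, not part of this commit
    import java.io.File;
    import java.io.IOException;
    import de.anomic.plasma.*;
    import de.anomic.server.serverLog;

    public class wordIndexUsageSketch {
        public static void indexOne(File databaseRoot, serverLog log, String wordHash, plasmaWordIndexEntry entry) throws IOException {
            // the classic file-based index is the persistent backend ...
            plasmaWordIndexInterface backend = new plasmaWordIndexClassicDB(databaseRoot, log);
            // ... and the RAM cache is layered on top; it flushes words to the backend when it grows beyond maxWords
            plasmaWordIndexCache index = new plasmaWordIndexCache(databaseRoot, backend, log);
            index.setMaxWords(10000); // 10000 is also the constructor default
            // word/url relations are handed over as containers keyed by the word hash
            index.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, entry), System.currentTimeMillis());
            // close() dumps the remaining cache to indexDump.stack so restore() can reload it on restart
            index.close(60);
        }
    }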
|
@@ -0,0 +1,255 @@
|
||||
// plasmaWordIndexClassicDB.java
|
||||
// -----------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2005
|
||||
// last major change: 6.5.2005
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
import de.anomic.kelondro.*;
|
||||
import de.anomic.server.serverLog;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
||||
public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
|
||||
|
||||
|
||||
// class variables
|
||||
private File databaseRoot;
|
||||
private serverLog log;
|
||||
private int size;
|
||||
|
||||
public plasmaWordIndexClassicDB(File databaseRoot, serverLog log) throws IOException {
|
||||
this.databaseRoot = databaseRoot;
|
||||
this.log = log;
|
||||
this.size = 0;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
public Iterator wordHashes(String startHash, boolean up) {
|
||||
return new iterateFiles(startHash, up);
|
||||
}
|
||||
|
||||
public class iterateFiles implements Iterator {
|
||||
|
||||
private ArrayList hierarchy; // contains TreeSet elements; each TreeSet contains File entries
|
||||
private Comparator comp; // for string-compare
|
||||
private String buffer; // the prefetch-buffer
|
||||
|
||||
public iterateFiles(String startHash, boolean up) {
|
||||
this.hierarchy = new ArrayList();
|
||||
this.comp = kelondroMSetTools.fastStringComparator(up);
|
||||
|
||||
// we initially fill the hierarchy with the content of the root folder
|
||||
String path = "WORDS";
|
||||
TreeSet list = list(new File(databaseRoot, path));
|
||||
|
||||
// if we have a start hash then we find the appropriate subdirectory to start
|
||||
if ((startHash != null) && (startHash.length() == yacySeedDB.commonHashLength)) {
|
||||
delete(startHash.substring(0, 1), list);
|
||||
if (list.size() > 0) {
|
||||
hierarchy.add(list);
|
||||
String[] paths = new String[]{startHash.substring(0, 1), startHash.substring(1, 2), startHash.substring(2, 4), startHash.substring(4, 6)};
|
||||
int pathc = 0;
|
||||
while ((pathc < paths.length) &&
|
||||
(comp.compare((String) list.first(), paths[pathc]) == 0)) {
|
||||
path = path + "/" + paths[pathc];
|
||||
list = list(new File(databaseRoot, path));
|
||||
delete(paths[pathc], list);
|
||||
if (list.size() == 0) break;
|
||||
hierarchy.add(list);
|
||||
pathc++;
|
||||
}
|
||||
}
|
||||
while (((buffer = next0()) != null) && (comp.compare(buffer, startHash) < 0)) {};
|
||||
} else {
|
||||
hierarchy.add(list);
|
||||
buffer = next0();
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void delete(String pattern, TreeSet names) {
|
||||
String name;
|
||||
while ((names.size() > 0) && (comp.compare((new File(name = (String) names.first())).getName(), pattern) < 0)) names.remove(name);
|
||||
}
|
||||
|
||||
private TreeSet list(File path) {
|
||||
//System.out.println("PATH: " + path);
|
||||
TreeSet t = new TreeSet(comp);
|
||||
String[] l = path.list();
|
||||
if (l != null) for (int i = 0; i < l.length; i++) t.add(path + "/" + l[i]);
|
||||
//else System.out.println("DEBUG: wrong path " + path);
|
||||
//System.out.println(t);
|
||||
return t;
|
||||
}
|
||||
|
||||
private synchronized String next0() {
|
||||
// the object is a File pointing to the corresponding file
|
||||
File f;
|
||||
String n;
|
||||
TreeSet t;
|
||||
do {
|
||||
t = null;
|
||||
while ((t == null) && (hierarchy.size() > 0)) {
|
||||
t = (TreeSet) hierarchy.get(hierarchy.size() - 1);
|
||||
if (t.size() == 0) {
|
||||
hierarchy.remove(hierarchy.size() - 1); // we step up one hierarchy
|
||||
t = null;
|
||||
}
|
||||
}
|
||||
if ((hierarchy.size() == 0) || (t.size() == 0)) return null; // this is the end
|
||||
// fetch value
|
||||
f = new File(n = (String) t.first());
|
||||
t.remove(n);
|
||||
// if the value represents another folder, we step into the next hierarchy
|
||||
if (f.isDirectory()) {
|
||||
t = list(f);
|
||||
if (t.size() == 0) {
|
||||
// the folder is empty, delete it
|
||||
f.delete();
|
||||
} else {
|
||||
hierarchy.add(t);
|
||||
}
|
||||
f = null;
|
||||
}
|
||||
} while (f == null);
|
||||
// thats it
|
||||
if ((f == null) || ((n = f.getName()) == null) || (n.length() < yacySeedDB.commonHashLength)) {
|
||||
return null;
|
||||
} else {
|
||||
return n.substring(0, yacySeedDB.commonHashLength);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return buffer != null;
|
||||
}
|
||||
|
||||
public Object next() {
|
||||
String r = buffer;
|
||||
while (((buffer = next0()) != null) && (comp.compare(buffer, r) < 0)) {};
|
||||
return r;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) {
|
||||
try {
|
||||
return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
|
||||
} catch (IOException e) {
|
||||
log.logError("plasmaWordIndexClassic.getIndex: " + e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public long getCreationTime(String wordHash) {
|
||||
File f = plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash);
|
||||
if (f.exists()) return f.lastModified(); else return -1;
|
||||
}
|
||||
|
||||
|
||||
public void deleteIndex(String wordHash) {
|
||||
try {
|
||||
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
|
||||
} catch (IOException e) {
|
||||
log.logError("plasmaWordIndexClassic.deleteIndex: " + e.getMessage());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) {
|
||||
// removes all given url hashes from a single word index. Returns number of deletions.
|
||||
plasmaWordIndexEntity pi = getIndex(wordHash, true);
|
||||
int count = 0;
|
||||
try {
|
||||
for (int i = 0; i < urlHashes.length; i++)
|
||||
if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
|
||||
int size = pi.size();
|
||||
pi.close(); pi = null;
|
||||
// check if we can remove the index completely
|
||||
if ((deleteComplete) && (size == 0)) deleteIndex(wordHash);
|
||||
return count;
|
||||
} catch (IOException e) {
|
||||
log.logError("plasmaWordIndexClassic.removeEntries: " + e.getMessage());
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
||||
public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) {
|
||||
//System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
|
||||
// fetch the index cache
|
||||
if (container.size() == 0) return 0;
|
||||
|
||||
// open file
|
||||
try {
|
||||
plasmaWordIndexEntity pi = new plasmaWordIndexEntity(databaseRoot, container.wordHash(), false);
|
||||
int count = 0;
|
||||
|
||||
// write from vector
|
||||
if (container != null) {
|
||||
Iterator i = container.entries();
|
||||
while (i.hasNext()) {
|
||||
if (pi.addEntry((plasmaWordIndexEntry) i.next())) count++;
|
||||
}
|
||||
}
|
||||
|
||||
// close and return
|
||||
pi.close();
|
||||
pi = null;
|
||||
return count;
|
||||
} catch (IOException e) {
|
||||
log.logError("plasmaWordIndexClassic.addEntries: " + e.getMessage());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public void close(int waitingSeconds) {
|
||||
|
||||
}
|
||||
|
||||
}
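
The iterateFiles iterator above walks the WORDS directory tree level by level, using the hash prefix segments substring(0,1), substring(1,2), substring(2,4) and substring(4,6) as folder names. The authoritative hash-to-file mapping is plasmaWordIndexEntity.wordHash2path, which is not part of this diff, so the following is only an inferred sketch of that layout; the helper class, the method name and the ".db" suffix are assumptions.

    // illustration only; layout inferred from iterateFiles, file suffix assumed
    import java.io.File;

    final class wordHashPathSketch {
        // e.g. wordHash "A7rgSSUOyrQH" -> WORDS/A/7/rg/SS/A7rgSSUOyrQH.db
        static File guessPath(File databaseRoot, String wordHash) {
            return new File(databaseRoot, "WORDS/" + wordHash.substring(0, 1)
                    + "/" + wordHash.substring(1, 2)
                    + "/" + wordHash.substring(2, 4)
                    + "/" + wordHash.substring(4, 6)
                    + "/" + wordHash + ".db");
        }
    }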
|
@@ -0,0 +1,97 @@
|
||||
// plasmaIndexEntryContainer.java
|
||||
// ------------------------------
|
||||
// part of YaCy
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2005
|
||||
// last major change: 07.05.2005
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class plasmaWordIndexEntryContainer {
|
||||
|
||||
private String wordHash;
|
||||
private HashMap container;
|
||||
|
||||
public plasmaWordIndexEntryContainer(String wordHash) {
|
||||
this.wordHash = wordHash;
|
||||
container = new HashMap(); // a urlhash/plasmaWordIndexEntry - relation
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return container.size();
|
||||
}
|
||||
|
||||
public String wordHash() {
|
||||
return wordHash;
|
||||
}
|
||||
|
||||
public boolean add(plasmaWordIndexEntry entry) {
|
||||
// returns true if the new entry was added, false if it already existed
|
||||
String urlHash = entry.getUrlHash();
|
||||
if (container.containsKey(urlHash)) return false;
|
||||
container.put(urlHash, entry);
|
||||
return true;
|
||||
}
|
||||
|
||||
public int add(plasmaWordIndexEntryContainer c) {
|
||||
// returns the number of new elements
|
||||
Iterator i = c.entries();
|
||||
int x = 0;
|
||||
while (i.hasNext()) {
|
||||
if (add((plasmaWordIndexEntry) i.next())) x++;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
public Iterator entries() {
|
||||
// returns an iterator of plasmaWordIndexEntry objects
|
||||
return container.values().iterator();
|
||||
}
|
||||
|
||||
public static plasmaWordIndexEntryContainer instantContainer(String wordHash, plasmaWordIndexEntry entry) {
|
||||
plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash);
|
||||
c.add(entry);
|
||||
return c;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "C[" + wordHash + "] has " + container.size() + " entries";
|
||||
}
|
||||
|
||||
}
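
A short sketch of the container semantics defined above: entries are keyed by their url hash, so adding the same url twice is a no-op, and merging another container reports only the genuinely new entries. This is illustrative only and not part of the commit; the class and method names are invented, and the two plasmaWordIndexEntry objects are assumed to exist for the same word.

    // illustration only, not part of this commit
    import de.anomic.plasma.plasmaWordIndexEntry;
    import de.anomic.plasma.plasmaWordIndexEntryContainer;

    final class containerUsageSketch {
        static int merge(String wordHash, plasmaWordIndexEntry e1, plasmaWordIndexEntry e2) {
            plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash);
            c.add(e1); // true: this url hash was not in the container yet
            c.add(e1); // false: duplicate url hash, the entry is ignored
            // add(container) returns only the number of entries that were actually new
            return c.add(plasmaWordIndexEntryContainer.instantContainer(wordHash, e2));
        }
    }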
|
@@ -1,275 +0,0 @@
|
||||
// plasmaWordIndexFileCache.java
|
||||
// -----------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
// last major change: 22.01.2004
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
|
||||
/*
|
||||
The plasmaIndexCache manages a database table with a list of
|
||||
indexEntries in it. This is done in a completely different fashion
|
||||
as organized by the plasmaIndex tables. The entries are not
|
||||
sorted and just stored in a buffer.
|
||||
Whenever during a seach an index is retrieved, first it's buffer
|
||||
is flushed into the corresponding index table, so that it can be
|
||||
sorted into the remaining index entry elements.
|
||||
The cache database consist of
|
||||
- the word hash as primary key
|
||||
- one column with a one-byte counter
|
||||
- a number of more columns with indexEntry elements
|
||||
*/
|
||||
|
||||
|
||||
// compile with
|
||||
// javac -classpath classes -sourcepath source -d classes -g source/de/anomic/plasma/*.java
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Vector;
|
||||
|
||||
import de.anomic.kelondro.kelondroException;
|
||||
import de.anomic.kelondro.kelondroTree;
|
||||
import de.anomic.server.serverLog;
|
||||
|
||||
public class plasmaWordIndexFileCache {
|
||||
|
||||
private static final String indexCacheFileName = "indexCache.db";
|
||||
private static final int buffers = 50; // number of buffered entries per word
|
||||
|
||||
// class variables
|
||||
private File databaseRoot;
|
||||
private kelondroTree indexCache;
|
||||
private int bufferkb;
|
||||
|
||||
public plasmaWordIndexFileCache(File databaseRoot, int bufferkb) throws IOException {
|
||||
this.databaseRoot = databaseRoot;
|
||||
this.bufferkb = bufferkb;
|
||||
File indexCacheFile = new File(databaseRoot, indexCacheFileName);
|
||||
if (indexCacheFile.exists()) {
|
||||
// simply open the file
|
||||
indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400);
|
||||
} else {
|
||||
createCacheFile(indexCacheFile);
|
||||
}
|
||||
}
|
||||
|
||||
private void resetCacheFile() {
|
||||
// this has to be used in emergencies only
|
||||
// it can happen that there is a serious db inconsistency; in that case we re-create the indexCache
|
||||
try { indexCache.close(); } catch (IOException e) {}
|
||||
File indexCacheFile = new File(databaseRoot, indexCacheFileName);
|
||||
if (indexCacheFile.exists()) indexCacheFile.delete();
|
||||
try {
|
||||
createCacheFile(indexCacheFile);
|
||||
} catch (IOException e) {
|
||||
de.anomic.server.serverLog.logError("PLASMA", "plasmaWordIndexFileCache.resetCacheFile: serious failure creating the cache file: " + e.getMessage());
|
||||
indexCache = null;
|
||||
}
|
||||
}
|
||||
|
||||
private void createCacheFile(File indexCacheFile) throws IOException {
|
||||
// create a new file
|
||||
int[] columns = new int[buffers + 2];
|
||||
columns[0] = plasmaWordIndexEntry.wordHashLength;
|
||||
columns[1] = 1;
|
||||
for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort;
|
||||
indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns);
|
||||
}
|
||||
|
||||
protected void close() throws IOException {
|
||||
indexCache.close();
|
||||
indexCache = null;
|
||||
}
|
||||
|
||||
private byte[][] getCache(String wordHash) throws IOException {
|
||||
// read one line from the cache; if none exists: construct one
|
||||
byte[][] row;
|
||||
try {
|
||||
row = indexCache.get(wordHash.getBytes());
|
||||
} catch (Exception e) {
|
||||
// we had some negativeSeekOffsetExceptions here, and also loops may cause this
|
||||
// in that case the indexCache is corrupt
|
||||
System.out.println("Error in plasmaWordINdexFileCache.getCache: index for hash " + wordHash + " is corrupt:" + e.toString());
|
||||
//e.printStackTrace();
|
||||
row = null;
|
||||
}
|
||||
if (row == null) {
|
||||
row = new byte[indexCache.columns()][];
|
||||
row[0] = wordHash.getBytes();
|
||||
row[1] = new byte[1];
|
||||
row[1][0] = (byte) 0;
|
||||
}
|
||||
return row;
|
||||
}
|
||||
|
||||
|
||||
protected Iterator wordHashes(String wordHash, boolean up) throws IOException {
|
||||
try {
|
||||
return indexCache.rows(up, false, (wordHash == null) ? null : wordHash.getBytes());
|
||||
} catch (kelondroException e) {
|
||||
de.anomic.server.serverLog.logError("PLASMA", "kelondro error in plasmaWordIndexFileCache: " + e.getMessage() + "; deleting index for " + wordHash);
|
||||
deleteComplete(wordHash);
|
||||
return new HashSet().iterator();
|
||||
}
|
||||
}
|
||||
|
||||
protected plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) throws IOException {
|
||||
// first flush the index cache, if there is any for that word hash
|
||||
byte[][] row = indexCache.get(wordHash.getBytes());
|
||||
if (row != null) {
|
||||
int entries = (int) row[1][0];
|
||||
if (entries != 0) flushCache(row, null); // if the cache has entries, flush it
|
||||
indexCache.remove(wordHash.getBytes()); // delete the cache index row; suppose to be empty now
|
||||
}
|
||||
// then return the index from the uncached file (with new entries)
|
||||
return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
|
||||
}
|
||||
|
||||
protected void addEntriesToIndex(String wordHash, Vector /* of plasmaIndexEntry */ newEntries) throws IOException {
|
||||
//System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
|
||||
// fetch the index cache
|
||||
if (newEntries.size() == 0) return;
|
||||
byte[][] row = getCache(wordHash);
|
||||
int entries = (int) row[1][0];
|
||||
// check if the index cache is full
|
||||
if (entries + 2 + newEntries.size() >= indexCache.columns()) {
|
||||
flushCache(row, newEntries); // and put in new values
|
||||
entries = 0;
|
||||
row[1][0] = (byte) 0; // set number of entries to zero
|
||||
} else {
|
||||
// put in the new values
|
||||
String newEntry;
|
||||
for (int i = 0; i < newEntries.size(); i++) {
|
||||
newEntry = ((plasmaWordIndexEntry) newEntries.elementAt(i)).getUrlHash() + ((plasmaWordIndexEntry) newEntries.elementAt(i)).toEncodedForm(false);
|
||||
row[entries + 2] = newEntry.getBytes();
|
||||
entries++;
|
||||
}
|
||||
row[1][0] = (byte) entries;
|
||||
try {
|
||||
indexCache.put(row);
|
||||
} catch (kelondroException e) {
|
||||
// this is a very bad case; a database inconsistency occurred
|
||||
serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted");
|
||||
resetCacheFile();
|
||||
} catch (IOException e) {
|
||||
// this is a very bad case; a database inconsistency occurred
|
||||
serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted");
|
||||
resetCacheFile();
|
||||
}
|
||||
}
|
||||
// finished!
|
||||
}
|
||||
|
||||
protected void deleteComplete(String wordHash) throws IOException {
|
||||
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
|
||||
indexCache.remove(wordHash.getBytes());
|
||||
}
|
||||
|
||||
protected int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) throws IOException {
|
||||
// removes all given url hashes from a single word index. Returns number of deletions.
|
||||
plasmaWordIndexEntity pi = getIndex(wordHash, true);
|
||||
int count = 0;
|
||||
for (int i = 0; i < urlHashes.length; i++) if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
|
||||
int size = pi.size();
|
||||
pi.close(); pi = null;
|
||||
// check if we can remove the index completely
|
||||
if ((deleteComplete) && (size == 0)) {
|
||||
// remove index
|
||||
if (!(plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash)))
|
||||
System.out.println("DEBUG: cannot remove index file for word hash " + wordHash);
|
||||
// remove cache
|
||||
indexCache.remove(wordHash.getBytes());
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private synchronized void flushCache(byte[][] row, Vector indexEntries) throws IOException {
|
||||
String wordHash = new String(row[0]);
|
||||
int entries = (int) row[1][0];
|
||||
if ((entries == 0) && ((indexEntries == null) || (indexEntries.size() == 0))) return;
|
||||
|
||||
// open file
|
||||
plasmaWordIndexEntity pi = new plasmaWordIndexEntity(databaseRoot, wordHash, false);
|
||||
|
||||
// write from array
|
||||
plasmaWordIndexEntry entry;
|
||||
for (int i = 0; i < entries; i++) {
|
||||
entry = new plasmaWordIndexEntry(new String(row[i + 2], 0, plasmaCrawlLURL.urlHashLength),
|
||||
new String(row[i + 2], plasmaCrawlLURL.urlHashLength, plasmaWordIndexEntry.attrSpaceShort));
|
||||
pi.addEntry(entry);
|
||||
}
|
||||
|
||||
// write from vector
|
||||
if (indexEntries != null) {
|
||||
for (int i = 0; i < indexEntries.size(); i++)
|
||||
pi.addEntry((plasmaWordIndexEntry) indexEntries.elementAt(i));
|
||||
}
|
||||
|
||||
// close and return
|
||||
pi.close();
|
||||
pi = null;
|
||||
}
|
||||
|
||||
private int size(String wordHash) throws IOException {
|
||||
// return number of entries in specific cache
|
||||
byte[][] row = indexCache.get(wordHash.getBytes());
|
||||
if (row == null) return 0;
|
||||
return (int) row[1][0];
|
||||
}
|
||||
|
||||
protected int size() {
|
||||
if (indexCache == null) return 0; else return indexCache.size();
|
||||
}
|
||||
|
||||
/*
|
||||
private plasmaIndex getIndexF(String wordHash) throws IOException {
|
||||
return new plasmaIndex(databaseRoot, wordHash);
|
||||
}
|
||||
|
||||
private void addEntryToIndexF(String wordHash, plasmaIndexEntry entry) throws IOException {
|
||||
plasmaIndex pi = new plasmaIndex(databaseRoot, wordHash);
|
||||
pi.addEntry(entry);
|
||||
pi.close();
|
||||
}
|
||||
*/
|
||||
|
||||
}
|
@@ -0,0 +1,62 @@
|
||||
// plasmaWordIndexInterface.java
|
||||
// -----------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2005
|
||||
// last major change: 6.5.2005
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public interface plasmaWordIndexInterface {
|
||||
|
||||
public int size();
|
||||
|
||||
public Iterator wordHashes(String startWordHash, boolean up);
|
||||
|
||||
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty);
|
||||
public long getCreationTime(String wordHash);
|
||||
public void deleteIndex(String wordHash);
|
||||
|
||||
public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete);
|
||||
public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime);
|
||||
|
||||
public void close(int waitingSeconds);
|
||||
|
||||
}
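
Both plasmaWordIndexCache and plasmaWordIndexClassicDB in this commit implement this interface, which is what lets the cache wrap the classic DB as its backend. As an illustration of the contract (not part of the commit, class name invented), a do-nothing stub could look like this:

    // illustration only, not part of this commit
    import java.util.Collections;
    import java.util.Iterator;
    import de.anomic.plasma.*;

    final class nullWordIndexSketch implements plasmaWordIndexInterface {
        public int size() { return 0; }
        public Iterator wordHashes(String startWordHash, boolean up) { return Collections.EMPTY_SET.iterator(); }
        public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) { return null; }
        public long getCreationTime(String wordHash) { return -1; } // same convention as the classic DB: -1 means unknown
        public void deleteIndex(String wordHash) { }
        public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { return 0; }
        public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime) { return 0; } // nothing is stored
        public void close(int waitingSeconds) { }
    }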
|
@@ -1,253 +0,0 @@
|
||||
// plasmaIndexRAMCache.java
|
||||
// -----------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
// last major change: 22.12.2004
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
// compile with
|
||||
// javac -classpath classes -sourcepath source -d classes -g source/de/anomic/plasma/*.java
|
||||
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Vector;
|
||||
|
||||
import de.anomic.kelondro.kelondroMScoreCluster;
|
||||
import de.anomic.server.serverLog;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
||||
public class plasmaWordIndexRAMCache extends Thread {
|
||||
|
||||
static String minKey, maxKey;
|
||||
|
||||
// class variables
|
||||
TreeMap cache;
|
||||
kelondroMScoreCluster hashScore;
|
||||
plasmaWordIndexFileCache pic;
|
||||
boolean terminate;
|
||||
long terminateUntil;
|
||||
int maxWords;
|
||||
|
||||
static {
|
||||
maxKey = "";
|
||||
for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += 'z';
|
||||
minKey = "";
|
||||
for (int i = 0; i < yacySeedDB.commonHashLength; i++) minKey += '-';
|
||||
}
|
||||
|
||||
public plasmaWordIndexRAMCache(File databaseRoot, int bufferkb) throws IOException {
|
||||
this.pic = new plasmaWordIndexFileCache(databaseRoot, bufferkb);
|
||||
this.cache = new TreeMap();
|
||||
this.hashScore = new kelondroMScoreCluster();
|
||||
this.maxWords = 1000;
|
||||
this.terminate = false;
|
||||
}
|
||||
|
||||
public int maxURLinWordCache() {
|
||||
return hashScore.getScore(hashScore.getMaxObject());
|
||||
}
|
||||
|
||||
public int wordCacheRAMSize() {
|
||||
return cache.size();
|
||||
}
|
||||
|
||||
public void setMaxWords(int maxWords) {
|
||||
this.maxWords = maxWords;
|
||||
}
|
||||
|
||||
public void run() {
|
||||
serverLog.logSystem("PLASMA INDEXING", "started word cache management");
|
||||
int check;
|
||||
// permanently flush cache elements
|
||||
while (!(terminate)) {
|
||||
if (hashScore.size() < 100) try {Thread.currentThread().sleep(10000);} catch (InterruptedException e) {}
|
||||
while ((!(terminate)) && (cache != null) && (hashScore.size() > 0)) try {
|
||||
check = hashScore.size();
|
||||
flushSpecific(false);
|
||||
//serverLog.logDebug("PLASMA INDEXING", "single flush. bevore=" + check + "; after=" + hashScore.size());
|
||||
try {Thread.currentThread().sleep(10 + ((maxWords / 10) / (1 + hashScore.size())));} catch (InterruptedException e) {}
|
||||
} catch (IOException e) {
|
||||
serverLog.logError("PLASMA INDEXING", "PANIK! exception in main cache loop: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
terminate = true;
|
||||
cache = null;
|
||||
}
|
||||
}
|
||||
|
||||
serverLog.logSystem("PLASMA INDEXING", "CATCHED TERMINATION SIGNAL: start final flush");
|
||||
|
||||
// close all;
|
||||
try {
|
||||
// first flush everything
|
||||
while ((hashScore.size() > 0) && (System.currentTimeMillis() < terminateUntil)) {
|
||||
flushSpecific(false);
|
||||
}
|
||||
|
||||
// then close file cache:
|
||||
pic.close();
|
||||
} catch (IOException e) {
|
||||
serverLog.logDebug("PLASMA INDEXING", "interrupted final flush: " + e.toString());
|
||||
}
|
||||
// report
|
||||
if (hashScore.size() == 0)
|
||||
serverLog.logSystem("PLASMA INDEXING", "finished final flush; flushed all words");
|
||||
else
|
||||
serverLog.logError("PLASMA INDEXING", "terminated final flush; " + hashScore.size() + " words lost");
|
||||
|
||||
// delete data
|
||||
cache = null;
|
||||
hashScore = null;
|
||||
|
||||
}
|
||||
|
||||
public void close(int waitingBoundSeconds) {
|
||||
terminate = true;
|
||||
// wait until termination is done
|
||||
// we can do at least 6 flushes/second
|
||||
int waitingtime = 10 + (((cache == null) ? 0 : cache.size()) / 5); // seconds
|
||||
if (waitingtime > waitingBoundSeconds) waitingtime = waitingBoundSeconds; // upper bound
|
||||
this.terminateUntil = System.currentTimeMillis() + (waitingtime * 1000);
|
||||
terminate = true;
|
||||
while ((cache != null) && (waitingtime > 0)) {
|
||||
serverLog.logDebug("PLASMA INDEXING", "final word flush; cache.size=" + cache.size() + "; time-out in " + waitingtime + " seconds");
|
||||
try {Thread.currentThread().sleep(5000);} catch (InterruptedException e) {}
|
||||
waitingtime -= 5;
|
||||
}
|
||||
}
|
||||
|
||||
private int flushSpecific(boolean greatest) throws IOException {
|
||||
//System.out.println("DEBUG: plasmaIndexRAMCache.flushSpecific(" + ((greatest) ? "greatest" : "smallest") + "); cache.size() = " + cache.size());
|
||||
if ((hashScore.size() == 0) && (cache.size() == 0)) {
|
||||
serverLog.logDebug("PLASMA INDEXING", "flushSpecific: called but cache is empty");
|
||||
return 0;
|
||||
}
|
||||
if ((hashScore.size() == 0) && (cache.size() != 0)) {
|
||||
serverLog.logError("PLASMA INDEXING", "flushSpecific: hashScore.size=0 but cache.size=" + cache.size());
|
||||
return 0;
|
||||
}
|
||||
if ((hashScore.size() != 0) && (cache.size() == 0)) {
|
||||
serverLog.logError("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + " but cache.size=0");
|
||||
return 0;
|
||||
}
|
||||
|
||||
//serverLog.logDebug("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + ", cache.size=" + cache.size());
|
||||
|
||||
String key = (String) ((greatest) ? hashScore.getMaxObject() : hashScore.getMinObject());
|
||||
return flushKey(key, "flushSpecific");
|
||||
}
|
||||
|
||||
private int flushKey(String key, String caller) throws IOException {
|
||||
Vector v = null;
|
||||
v = (Vector) cache.get(key);
|
||||
if (v == null) return 0; // flushing of nonexisting key
|
||||
synchronized (cache) {
|
||||
cache.remove(key);
|
||||
hashScore.deleteScore(key);
|
||||
}
|
||||
pic.addEntriesToIndex(key, v);
|
||||
return v.size();
|
||||
}
|
||||
|
||||
public synchronized Iterator wordHashesMem(String wordHash, int count) throws IOException {
|
||||
// returns a list of hashes from a specific start point
|
||||
// we need to flush some of the elements in the cache first
|
||||
// maybe we flush too much, but this is not easy to find out and it does not matter
|
||||
TreeMap subMap = new TreeMap(cache.subMap((wordHash == null) ? minKey : wordHash, maxKey));
|
||||
int flushcount = subMap.size();
|
||||
if (flushcount > count) flushcount = count;
|
||||
String key;
|
||||
for (int i = 0; i < flushcount ; i++) {
|
||||
key = (String) subMap.firstKey();
|
||||
flushKey(key, "getSequentialWordHashesMem");
|
||||
subMap.remove(key);
|
||||
}
|
||||
// finally return the result from the underlying hash list:
|
||||
return pic.wordHashes(wordHash, true);
|
||||
}
|
||||
|
||||
public plasmaWordIndexEntity getIndexMem(String wordHash, boolean deleteIfEmpty) throws IOException {
|
||||
flushKey(wordHash, "getIndexMem");
|
||||
return pic.getIndex(wordHash, deleteIfEmpty);
|
||||
}
|
||||
|
||||
public int addEntryToIndexMem(String wordHash, plasmaWordIndexEntry entry) throws IOException {
|
||||
// make space for new words
|
||||
int flushc = 0;
|
||||
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
|
||||
synchronized (hashScore) {
|
||||
while (hashScore.size() > maxWords) flushc += flushSpecific(true);
|
||||
}
|
||||
//if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries");
|
||||
|
||||
// put new words into cache
|
||||
synchronized (cache) {
|
||||
Vector v = (Vector) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
|
||||
if (v == null) v = new Vector();
|
||||
v.add(entry);
|
||||
cache.put(wordHash, v);
|
||||
hashScore.incScore(wordHash);
|
||||
}
|
||||
return flushc;
|
||||
}
|
||||
|
||||
public synchronized void deleteComplete(String wordHash) throws IOException {
|
||||
cache.remove(wordHash);
|
||||
hashScore.deleteScore(wordHash);
|
||||
pic.deleteComplete(wordHash);
|
||||
}
|
||||
|
||||
public int removeEntriesMem(String wordHash, String[] urlHashes, boolean deleteComplete) throws IOException {
|
||||
flushKey(wordHash, "removeEntriesMem");
|
||||
return pic.removeEntries(wordHash, urlHashes, deleteComplete);
|
||||
}
|
||||
|
||||
public int sizeMin() {
|
||||
// it is not easy to find out the correct size of the cache
|
||||
// to make the result correct, it would be necessary to flush the complete ram cache
|
||||
// instead, we return the minimum size of the cache, which is the maximum of either the
|
||||
// ram or table cache
|
||||
if ((hashScore == null) || (pic == null)) return 0;
|
||||
return (hashScore.size() < pic.size()) ? pic.size() : hashScore.size();
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -1,8 +1,2 @@
|
||||
#plasmaParser configuration file
|
||||
#Mon May 02 10:12:02 CEST 2005
|
||||
application/atom+xml=de.anomic.plasma.parser.rss.rssParser
|
||||
text/rss=de.anomic.plasma.parser.rss.rssParser
|
||||
application/rss+xml=de.anomic.plasma.parser.rss.rssParser
|
||||
application/rdf+xml=de.anomic.plasma.parser.rss.rssParser
|
||||
application/msword=de.anomic.plasma.parser.doc.docParser
|
||||
application/pdf=de.anomic.plasma.parser.pdf.pdfParser
|
||||
#Sat May 07 22:32:33 CEST 2005