introduced assortment structure (generalization of singletons)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@139 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 71b9cb0c33
commit 5c6147a54c

@ -240,10 +240,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSystem("Wiki Cache memory = " + ppRamString(ramWiki));
// make crawl profiles database and default profiles
log.logSystem("Initializing Crawl Profiles");
profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db"));
initProfiles();
// start indexing management
log.logSystem("Starting Indexing Management");
loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL);
errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL);
@ -253,19 +255,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
searchManager = new plasmaSearch(loadedURL, wordIndex);
// start a cache manager
log.logSystem("Starting HT Cache Manager");
this.cacheManager = new plasmaHTCache(this, ramHTTP);
// make parser
log.logSystem("Starting Parser");
this.parser = new plasmaParser();
// define an extension-blacklist
log.logSystem("Parser: Initializing Media Extensions");
plasmaParser.initMediaExt(getConfig("mediaExt",null));
// define a realtime parsable mimetype list
log.logSystem("Parser: Initializing Mime Types");
plasmaParser.initRealtimeParsableMimeTypes(getConfig("parseableRealtimeMimeTypes","application/xhtml+xml,text/html,text/plain"));
plasmaParser.initParseableMimeTypes(getConfig("parseableMimeTypes",null));
// start a loader
log.logSystem("Starting Crawl Loader");
int remoteport;
try { remoteport = Integer.parseInt(getConfig("remoteProxyPort","3128")); }
catch (NumberFormatException e) { remoteport = 3128; }
@ -277,18 +284,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
remoteport);
// init boards
log.logSystem("Starting Message Board");
messageDB = new messageBoard(new File(getRootPath(), "DATA/SETTINGS/message.db"), ramMessage);
wikiDB = new wikiBoard(new File(getRootPath(), "DATA/SETTINGS/wiki.db"),
log.logSystem("Starting Wiki Board");
wikiDB = new wikiBoard(new File(getRootPath(), "DATA/SETTINGS/wiki.db"),
new File(getRootPath(), "DATA/SETTINGS/wiki-bkp.db"), ramWiki);
// init cookie-Monitor
log.logSystem("Starting Cookie Monitor");
outgoingCookies = new HashMap();
incomingCookies = new HashMap();
// clean up profiles
log.logSystem("Cleaning Profiles");
cleanProfiles();
// init facility DB
log.logSystem("Starting Facility Database");
File facilityDBpath = new File(getRootPath(), "DATA/SETTINGS/");
facilityDB = new kelondroTables(facilityDBpath);
facilityDB.declareMaps("backlinks", 250, 500, new String[] {"date"}, null);
@ -299,10 +311,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
testresult = facilityDB.selectLong("statistik", (new serverDate()).toShortString(false).substring(0, 11));
// start yacy core
log.logSystem("Starting YaCy Protocol Core");
yacyCore yc = new yacyCore(this);
serverInstantThread.oneTimeJob(yc, "loadSeeds", yc.log, 3000);
// deploy threads
log.logSystem("Starting Threads");
deployThread("90_cleanup", "Cleanup", "simple cleaning process for monitoring information" ,
new serverInstantThread(this, "cleanupJob", "cleanupJobSize"), 10000); // all 5 Minutes
deployThread("80_dequeue", "Indexing Dequeue", "thread that creates database entries from scraped web content and performes indexing" ,

@ -0,0 +1,250 @@
// plasmaWordIndexAssortment.java
// ------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 18.5.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
/*
An assortment is a set of words that each appear on exactly a specific
number of different web pages. A special case is when the word
appears on only a single web page: this is called a 'singleton'.
YaCy maintains a word cache for words appearing on x web pages.
For each 'x' there is an assortment database, where 1<=x<=max.
If a word appears on more than 'max' web pages, the corresponding url-list
is stored in some kind of back-end database, which we consider to be the
'slowest' option for saving data.
*/
package de.anomic.plasma;
import java.io.*;
import java.util.*;
import java.lang.RuntimeException;
import de.anomic.kelondro.*;
import de.anomic.server.serverLog;
/**
 * Persistent store for one "assortment": all words whose URL list has exactly
 * <code>assortmentCapacity</code> entries. Each word is kept as one row of a
 * {@link kelondroTree}, keyed by the word hash.
 *
 * Row layout (see {@link #bufferStructureBasis}):
 * column 0 = word hash, column 1 = occurrence counter, column 2 = creation
 * time, followed by <code>assortmentCapacity</code> pairs of
 * (URL hash, encoded URL attributes).
 *
 * On an IO or kelondro error during store/read/remove/iteration the database
 * file is deleted and re-created empty (see {@link #resetDatabase()}).
 */
public final class plasmaWordIndexAssortment {

    // environment constants
    private static final String assortmentFileName = "indexAssortment";

    /** Column widths of a row that holds exactly one URL entry. */
    public static final int[] bufferStructureBasis = new int[]{
        plasmaWordIndexEntry.wordHashLength, // a wordHash
        4,                                   // occurrence counter
        8,                                   // timestamp of last access
        plasmaWordIndexEntry.urlHashLength,  // corresponding URL hash
        plasmaWordIndexEntry.attrSpaceLong   // URL attributes
    };

    // class variables
    private final File assortmentFile;
    private final int assortmentCapacity;
    private final serverLog log;
    private kelondroTree assortments; // replaced by resetDatabase(); null if opening/creating failed
    private final long bufferSize;    // bytes
    private final int bufferStructureLength;

    /**
     * Formats a number with at least three digits, left-padded with zeros
     * (used for the capacity suffix of the database file name).
     */
    private static String intx(int x) {
        String s = "" + x;
        while (s.length() < 3) s = "0" + s;
        return s;
    }

    /**
     * Builds the column-width array for rows that hold
     * <code>assortmentCapacity</code> URL entries: the three header columns of
     * {@link #bufferStructureBasis} followed by one (URL hash, attributes)
     * column pair per entry.
     */
    private static int[] bufferStructure(int assortmentCapacity) {
        int[] structure = new int[3 + 2 * assortmentCapacity];
        structure[0] = bufferStructureBasis[0];
        structure[1] = bufferStructureBasis[1];
        structure[2] = bufferStructureBasis[2];
        for (int i = 0; i < assortmentCapacity; i++) {
            structure[3 + 2 * i] = bufferStructureBasis[3];
            structure[4 + 2 * i] = bufferStructureBasis[4];
        }
        return structure;
    }

    /**
     * Opens the assortment database for the given capacity, creating the
     * storage directory and the database file if they do not exist yet.
     * The file is named <code>indexAssortment</code> + zero-padded capacity +
     * <code>.db</code> inside <code>storagePath</code>.
     *
     * If opening or creating fails, the error is logged and the instance is
     * left without a backing tree.
     *
     * @param storagePath        directory that holds the database file
     * @param assortmentCapacity number of URL entries stored per word
     * @param bufferkb           buffer size handed to the tree, in kilobytes
     * @param log                logger for status and error messages
     */
    public plasmaWordIndexAssortment(File storagePath, int assortmentCapacity, int bufferkb, serverLog log) {
        if (!(storagePath.exists())) storagePath.mkdirs();
        this.assortmentFile = new File(storagePath, assortmentFileName + intx(assortmentCapacity) + ".db");
        this.assortmentCapacity = assortmentCapacity;
        this.bufferStructureLength = 3 + 2 * assortmentCapacity;
        // multiply as long: an int product overflows for buffers of 2 GB and more
        this.bufferSize = bufferkb * 1024L;
        this.log = log;
        if (assortmentFile.exists()) {
            // open existing assortment tree file
            try {
                assortments = new kelondroTree(assortmentFile, bufferSize);
                log.logSystem("Opened Assortment Database, " + assortments.size() + " entries.");
            } catch (IOException e){
                log.logError("unable to open assortment database: " + e.getMessage());
                e.printStackTrace();
            }
        } else {
            // create new assortment tree file
            try {
                assortments = new kelondroTree(assortmentFile, bufferSize, bufferStructure(assortmentCapacity));
                log.logSystem("Created new Assortment Database");
            } catch (IOException e){
                log.logError("unable to create assortment database: " + e.getMessage());
                e.printStackTrace();
            }
        }
    }

    /** Convenience factory: wraps a single entry into a record. */
    public record newRecord(plasmaWordIndexEntry entry, long creationTime) {
        return new record(new plasmaWordIndexEntry[]{entry}, creationTime);
    }

    /** Factory for a record that holds the given entries (the array is not copied). */
    public record newRecord(plasmaWordIndexEntry[] entries, long creationTime) {
        return new record(entries, creationTime);
    }

    /** The URL entries of one word together with their creation time. */
    public class record {
        public plasmaWordIndexEntry[] entries;
        public long creationTime;
        public record(plasmaWordIndexEntry[] entries, long creationTime) {
            this.entries = entries;
            this.creationTime = creationTime;
        }
    }

    /**
     * Stores a word index to the assortment database.
     *
     * @param wordHash  key of the row
     * @param newRecord record with at least <code>assortmentCapacity</code>
     *                  entries; only the first <code>assortmentCapacity</code>
     *                  entries are written
     * @throws IllegalArgumentException if the record is missing or holds
     *                  fewer entries than the assortment capacity
     * @throws RuntimeException if a row for the word hash already existed
     */
    public void store(String wordHash, record newRecord) {
        //log.logDebug("storeAssortment: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime);
        // fail with a clear message instead of an index/null error in the loop below
        if ((newRecord == null) || (newRecord.entries == null) || (newRecord.entries.length < assortmentCapacity)) {
            throw new IllegalArgumentException("record must hold " + assortmentCapacity + " entries for this assortment");
        }
        byte[][] row = new byte[this.bufferStructureLength][];
        row[0] = wordHash.getBytes();
        // NOTE(review): the counter is always written as 1, independent of the capacity - confirm intent
        row[1] = kelondroRecords.long2bytes(1, 4);
        row[2] = kelondroRecords.long2bytes(newRecord.creationTime, 8);
        for (int i = 0; i < assortmentCapacity; i++) {
            row[3 + 2 * i] = newRecord.entries[i].getUrlHash().getBytes();
            row[4 + 2 * i] = newRecord.entries[i].toEncodedForm(true).getBytes();
        }
        byte[][] oldrow = null;
        try {
            oldrow = assortments.put(row);
        } catch (IOException e) {
            log.logFailure("storeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB");
            e.printStackTrace();
            resetDatabase();
        } catch (kelondroException e) {
            log.logFailure("storeAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB");
            e.printStackTrace();
            resetDatabase();
        }
        if (oldrow != null) throw new RuntimeException("Store to assortment ambiguous");
    }

    /**
     * Reads the word index for a word hash.
     *
     * @param wordHash key of the row
     * @return a record with exactly <code>assortmentCapacity</code> entries,
     *         or <code>null</code> if no row exists (or the read failed and
     *         the database was reset)
     */
    public record read(String wordHash) {
        //log.logDebug("readAssortment: wordHash=" + wordHash);
        byte[][] row = null;
        try {
            row = assortments.get(wordHash.getBytes());
        } catch (IOException e) {
            log.logFailure("readAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB");
            e.printStackTrace();
            resetDatabase();
        } catch (kelondroException e) {
            log.logFailure("readAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB");
            e.printStackTrace();
            resetDatabase();
        }
        if (row == null) return null;
        long creationTime = kelondroRecords.bytes2long(row[2]);
        // one slot per stored entry; sizing by the column count would leave trailing nulls
        plasmaWordIndexEntry[] wordEntries = new plasmaWordIndexEntry[assortmentCapacity];
        for (int i = 0; i < assortmentCapacity; i++) {
            wordEntries[i] = new plasmaWordIndexEntry(new String(row[3 + 2 * i]), new String(row[4 + 2 * i]));
        }
        return new record(wordEntries, creationTime);
    }

    /**
     * Deletes the word index for a word hash from the assortment database.
     *
     * @param wordHash key of the row to delete
     */
    public void remove(String wordHash) {
        //log.logDebug("removeAssortment: wordHash=" + wordHash);
        try {
            assortments.remove(wordHash.getBytes());
        } catch (IOException e) {
            log.logFailure("removeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB");
            e.printStackTrace();
            resetDatabase();
        } catch (kelondroException e) {
            log.logFailure("removeAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB");
            e.printStackTrace();
            resetDatabase();
        }
    }

    /**
     * Deletes the assortment database file and creates a new, empty one.
     *
     * @throws RuntimeException if the existing file cannot be deleted
     */
    private void resetDatabase() {
        if (assortments != null) {
            try {
                assortments.close();
            } catch (IOException e) {
                // best effort only: the file is deleted right below
                log.logError("unable to close assortment database before reset: " + e.getMessage());
            }
        }
        // a missing file needs no deletion and must not abort the reset
        if (assortmentFile.exists() && !(assortmentFile.delete())) throw new RuntimeException("cannot delete assortment database");
        try {
            assortments = new kelondroTree(assortmentFile, bufferSize, bufferStructure(assortmentCapacity));
        } catch (IOException e){
            log.logError("unable to re-create assortment database: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Returns the key iterator of the underlying tree.
     *
     * @param startWordHash word hash handed to the tree as start key
     * @param up            passed through to <code>kelondroTree.keys</code>
     * @param rot           passed through to <code>kelondroTree.keys</code>
     * @return the key iterator, or <code>null</code> if an IO error occurred
     *         (the database is reset in that case)
     */
    public Iterator hashes(String startWordHash, boolean up, boolean rot) {
        try {
            return assortments.keys(up, rot, startWordHash.getBytes());
        } catch (IOException e) {
            log.logFailure("iterateAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB");
            e.printStackTrace();
            resetDatabase();
            return null;
        }
    }

    /** @return the size reported by the underlying tree */
    public int size() {
        return assortments.size();
    }

    /** Closes the underlying tree; a failure is logged, not thrown. */
    public void close() {
        try {
            assortments.close();
        } catch (IOException e){
            log.logError("unable to close assortment database: " + e.getMessage());
            e.printStackTrace();
        }
    }
}

@ -53,14 +53,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
// environment constants
private static final String indexDumpFileName = "indexDump0.stack";
private static final String singletonFileName = "indexSingletons0.db";
private static final int[] bufferStructure = new int[]{
plasmaWordIndexEntry.wordHashLength, // a wordHash
4, // occurrence counter
8, // timestamp of last access
plasmaWordIndexEntry.urlHashLength, // corresponding URL hash
plasmaWordIndexEntry.attrSpaceLong // URL attributes
};
private static final String oldSingletonFileName = "indexSingletons0.db";
private static final String newSingletonFileName = "indexAssortment001.db";
// class variables
private File databaseRoot;
@ -70,8 +64,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
private HashMap hashDate;
private int maxWords;
private serverLog log;
private kelondroTree singletons;
private long singletonBufferSize;
private plasmaWordIndexAssortment singletons;
private int singletonBufferSize; //kb
// calculated constants
private static String minKey, maxKey;
@ -83,36 +77,23 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, int singletonbufferkb, serverLog log) {
// migrate
File oldSingletonFile = new File(databaseRoot, oldSingletonFileName);
File newSingletonFile = new File(databaseRoot, newSingletonFileName);
if ((oldSingletonFile.exists()) && (!(newSingletonFile.exists()))) oldSingletonFile.renameTo(newSingletonFile);
// creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed
this.databaseRoot = databaseRoot;
this.singletonBufferSize = singletonbufferkb * 1024;
this.singletonBufferSize = singletonbufferkb;
this.cache = new TreeMap();
this.hashScore = new kelondroMScoreCluster();
this.hashDate = new HashMap();
this.maxWords = 10000;
this.backend = backend;
this.log = log;
File singletonFile = new File(databaseRoot, singletonFileName);
if (singletonFile.exists()) {
// open existing singeton tree file
try {
singletons = new kelondroTree(singletonFile, singletonBufferSize);
log.logSystem("Opened Singleton Database, " + singletons.size() + " entries.");
} catch (IOException e){
log.logError("unable to open singleton database: " + e.getMessage());
e.printStackTrace();
}
} else {
// create new sigleton tree file
try {
singletons = new kelondroTree(singletonFile, singletonBufferSize, bufferStructure);
log.logSystem("Created new Singleton Database");
} catch (IOException e){
log.logError("unable to create singleton database: " + e.getMessage());
e.printStackTrace();
}
}
this.singletons = new plasmaWordIndexAssortment(databaseRoot, 1, singletonBufferSize, log);
// read in dump of last session
try {
restore();
@ -126,7 +107,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
log.logSystem("creating dump for index cache, " + cache.size() + " words (and much more urls)");
File indexDumpFile = new File(databaseRoot, indexDumpFileName);
if (indexDumpFile.exists()) indexDumpFile.delete();
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0, bufferStructure);
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 1024, plasmaWordIndexAssortment.bufferStructureBasis);
long startTime = System.currentTimeMillis();
long messageTime = System.currentTimeMillis() + 5000;
long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
@ -179,7 +160,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
private long restore() throws IOException {
File indexDumpFile = new File(databaseRoot, indexDumpFileName);
if (!(indexDumpFile.exists())) return 0;
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0);
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 1024);
log.logSystem("restore dump of index cache, " + dumpStack.size() + " word/url relations");
long startTime = System.currentTimeMillis();
long messageTime = System.currentTimeMillis() + 5000;
@ -217,97 +198,6 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
return urlCount;
}
// singleton access methods
private void storeSingleton(String wordHash, plasmaWordIndexEntry entry, long creationTime) {
// stores a word index to singleton database
// this throws an exception if the word hash already existed
//log.logDebug("storeSingleton: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime);
byte[][] row = new byte[5][];
row[0] = wordHash.getBytes();
row[1] = kelondroRecords.long2bytes(1, 4);
row[2] = kelondroRecords.long2bytes(creationTime, 8);
row[3] = entry.getUrlHash().getBytes();
row[4] = entry.toEncodedForm(true).getBytes();
byte[][] oldrow = null;
try {
oldrow = singletons.put(row);
} catch (IOException e) {
log.logFailure("storeSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB");
e.printStackTrace();
resetSingletonDatabase();
} catch (kelondroException e) {
log.logFailure("storeSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB");
e.printStackTrace();
resetSingletonDatabase();
}
if (oldrow != null) throw new RuntimeException("Store to singleton ambiguous");
}
public Object[] /*{plasmaWordIndexEntry, Long(creationTime)}*/ readSingleton(String wordHash) {
// returns a single word index from singleton database; returns null if index does not exist
//log.logDebug("readSingleton: wordHash=" + wordHash);
byte[][] row = null;
try {
row = singletons.get(wordHash.getBytes());
} catch (IOException e) {
log.logFailure("readSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB");
e.printStackTrace();
resetSingletonDatabase();
} catch (kelondroException e) {
log.logFailure("readSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB");
e.printStackTrace();
resetSingletonDatabase();
}
if (row == null) return null;
long creationTime = kelondroRecords.bytes2long(row[2]);
plasmaWordIndexEntry wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4]));
return new Object[]{wordEntry, new Long(creationTime)};
}
private void removeSingleton(String wordHash) {
// deletes a word index from singleton database
//log.logDebug("removeSingleton: wordHash=" + wordHash);
byte[][] row = null;
try {
row = singletons.remove(wordHash.getBytes());
} catch (IOException e) {
log.logFailure("removeSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB");
e.printStackTrace();
resetSingletonDatabase();
} catch (kelondroException e) {
log.logFailure("removeSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB");
e.printStackTrace();
resetSingletonDatabase();
}
}
private void resetSingletonDatabase() {
// deletes the singleton database and creates a new one
try {
singletons.close();
} catch (IOException e) {}
File singletonFile = new File(databaseRoot, singletonFileName);
if (!(singletonFile.delete())) throw new RuntimeException("cannot delete singleton database");
try {
singletons = new kelondroTree(singletonFile, singletonBufferSize, bufferStructure);
} catch (IOException e){
log.logError("unable to re-create singleton database: " + e.getMessage());
e.printStackTrace();
}
}
public Iterator singletonHashes(String startWordHash, boolean up, boolean rot) {
try {
return singletons.keys(up, rot, startWordHash.getBytes());
} catch (IOException e) {
log.logFailure("iterateSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB");
e.printStackTrace();
resetSingletonDatabase();
return null;
}
}
// cache settings
public int maxURLinWordCache() {
@ -318,14 +208,14 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
return cache.size();
}
public int singletonsSize() {
return singletons.size();
}
public void setMaxWords(int maxWords) {
this.maxWords = maxWords;
}
public int singletonsSize() {
return singletons.size();
}
public int size() {
return java.lang.Math.max(singletons.size(), java.lang.Math.max(backend.size(), cache.size()));
}
@ -339,7 +229,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
return new kelondroMergeIterator(
new kelondroMergeIterator(
cache.keySet().iterator(),
singletonHashes(startWordHash, true, false),
singletons.hashes(startWordHash, true, false),
true),
backend.wordHashes(startWordHash, true),
true);
@ -366,13 +256,14 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
hashScore.deleteScore(key);
hashDate.remove(key);
}
// now decide where to flush that container
Object[] singleton = readSingleton(key);
plasmaWordIndexAssortment.record singleton = singletons.read(key);
if (singleton == null) {
// not found in singletons
if (container.size() == 1) {
// it is a singleton: store to singleton
storeSingleton(key, container.getOne(), time);
singletons.store(key, singletons.newRecord(container.getOne(), time));
return 1;
} else {
// store to back-end; this should be a rare case
@ -380,8 +271,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
} else {
// we have a singleton and need to integrate this in the flush
plasmaWordIndexEntry oldEntry = (plasmaWordIndexEntry) singleton[0];
long oldTime = ((Long) singleton[1]).longValue();
plasmaWordIndexEntry oldEntry = singleton.entries[0];
long oldTime = singleton.creationTime;
if (container.contains(oldEntry.getUrlHash())) {
// we have an double-occurrence
if (container.size() == 1) {
@ -389,13 +280,13 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
return 0;
} else {
// we flush to the backend, and the entry from the singletons
removeSingleton(key);
singletons.remove(key);
return backend.addEntries(container, java.lang.Math.max(time, oldTime));
}
} else {
// now we have more than one entry
// we must remove the key from the singleton database
removeSingleton(key);
singletons.remove(key);
// .. and put it to the container
container.add(oldEntry);
if (reintegrate) {
@ -416,15 +307,15 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
private boolean flushFromSingleton(String key) {
// this should only be called if the singleton shall be deleted or returned in an index entity
Object[] singleton = readSingleton(key);
plasmaWordIndexAssortment.record singleton = singletons.read(key);
if (singleton == null) {
return false;
} else {
// we have a singleton
plasmaWordIndexEntry entry = (plasmaWordIndexEntry) singleton[0];
long time = ((Long) singleton[1]).longValue();
plasmaWordIndexEntry entry = (plasmaWordIndexEntry) singleton.entries[0];
long time = singleton.creationTime;
// remove it from the singleton database
removeSingleton(key);
singletons.remove(key);
// integrate it to the backend
return backend.addEntries(plasmaWordIndexEntryContainer.instantContainer(key, entry), time) > 0;
}
@ -518,7 +409,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
hashScore.deleteScore(wordHash);
hashDate.remove(wordHash);
}
removeSingleton(wordHash);
singletons.remove(wordHash);
backend.deleteIndex(wordHash);
}
@ -561,12 +452,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
public void close(int waitingSeconds) {
try {
singletons.close();
} catch (IOException e){
log.logError("unable to close singleton database: " + e.getMessage());
e.printStackTrace();
}
singletons.close();
try {
dump(waitingSeconds);
} catch (IOException e){

Loading…
Cancel
Save