- added an assortment importer. The old database structures can be imported with: java -classpath classes yacy -migrateassortments
- modified word migration. The indexes from WORDS are now imported to the collection database. The call is: java -classpath classes yacy -migratewords (as it was)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3044 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent 8043bb82fb
commit 109ed0a0bb
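
A usage note (the commands are taken from the commit message above; running them from the YaCy installation directory with the compiled classes in ./classes is an assumption about the setup):

    cd yacy
    java -classpath classes yacy -migrateassortments   # import the old assortment database structures
    java -classpath classes yacy -migratewords         # import the WORDS indexes into the collection database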
@@ -1,99 +0,0 @@

// plasmaURLPool.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 16.06.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software, which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// made inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

// this class combines all URL storage methods into one. It is the host for all URL storage.

package de.anomic.plasma;

import java.io.File;
import java.io.IOException;

import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;

public class plasmaURLPool {

    public final plasmaCrawlLURL loadedURL;
    public final plasmaCrawlNURL noticeURL;
    public final plasmaCrawlEURL errorURL;

    public plasmaURLPool(File plasmaPath, File indexPath,
                         int ramLURL,
                         int ramNURL,
                         int ramEURL,
                         long preloadTime) {
        loadedURL = new plasmaCrawlLURL(plasmaPath, indexPath, ramLURL, preloadTime);
        noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1);
        errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1);
    }

    public String exists(String hash) {
        // tests if the hash occurs in any database;
        // if it exists, the name of the database is returned,
        // if it does not exist, null is returned
        if (loadedURL.exists(hash)) return "loaded";
        if (noticeURL.existsInStack(hash)) return "crawler";
        if (errorURL.exists(hash)) return "errors";
        return null;
    }

    public URL getURL(String urlhash) throws IOException {
        if (urlhash.equals(plasmaURL.dummyHash)) return null;
        try {
            plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
            if (ne != null) return ne.url();
        } catch (IOException e) {}
        indexURLEntry le = loadedURL.load(urlhash, null);
        if (le != null) return le.comp().url();
        plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
        if (ee != null) return ee.url();
        return null;
    }

    public void close() {
        try {loadedURL.close();} catch (IOException e) {}
        noticeURL.close();
        try {errorURL.close();} catch (IOException e) {}
    }
}
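
For context, a minimal sketch of how such a pool is opened, queried and closed (an illustration only: the paths, buffer sizes and the dummy hash are hypothetical, and the de.anomic classes of this revision are assumed on the classpath):

import java.io.File;
import java.io.IOException;

import de.anomic.net.URL;
import de.anomic.plasma.plasmaURLPool;

public class URLPoolDemo {
    public static void main(String[] args) throws IOException {
        // open the pool over hypothetical database directories
        plasmaURLPool pool = new plasmaURLPool(
                new File("DATA/PLASMADB"), new File("DATA/INDEX"),
                1024, 1024, 1024, // RAM buffers for loaded/notice/error URL databases
                0);               // no preload time
        try {
            String hash = "AAAAAAAAAAAA";     // dummy 12-character URL hash
            String db = pool.exists(hash);    // "loaded", "crawler", "errors" or null
            if (db != null) {
                URL url = pool.getURL(hash);  // resolve the hash back to its URL
                System.out.println(db + ": " + url);
            }
        } finally {
            pool.close(); // flush and close all three URL databases
        }
    }
}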

@@ -1,408 +0,0 @@

// plasmaWordIndexAssortmentCluster.java
// -------------------------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 20.5.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software, which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// made inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

/*
   An assortment cluster is a set of assortments.
   Each one carries a different number of URLs.
*/

package de.anomic.plasma;

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import de.anomic.index.indexContainer;
import de.anomic.index.indexContainerOrder;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRI;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroRow;
import de.anomic.server.logging.serverLog;

public final class plasmaWordIndexAssortmentCluster implements indexRI {

    // class variables
    private int clusterCount;   // number of cluster files
    public int clusterCapacity; // number of all URL references that can be stored for a single word in the cluster

    //private serverLog log;
    private plasmaWordIndexAssortment[] assortments;
    private long completeBufferKB;
    private kelondroRow payloadrow;

    public plasmaWordIndexAssortmentCluster(File assortmentsPath, int clusterCount, kelondroRow payloadrow, int bufferkb, long preloadTime, serverLog log) throws IOException {
        // set class variables
        if (!(assortmentsPath.exists())) assortmentsPath.mkdirs();
        this.payloadrow = payloadrow;
        this.clusterCount = clusterCount;
        this.clusterCapacity = clusterCount * (clusterCount + 1) / 2;
        this.completeBufferKB = bufferkb;
        // this.log = log;
        this.assortments = new plasmaWordIndexAssortment[clusterCount];

        // open the cluster and close it directly again to detect the element sizes
        int[] sizes = new int[clusterCount];
        int sumSizes = 1;
        plasmaWordIndexAssortment testAssortment;
        for (int i = 0; i < clusterCount; i++) {
            testAssortment = new plasmaWordIndexAssortment(assortmentsPath, payloadrow, i + 1, 0, 0, null);
            sizes[i] = testAssortment.size() + clusterCount - i;
            sumSizes += sizes[i];
            testAssortment.close();
            testAssortment = null;
        }

        // initialize the cluster, using the cluster element sizes for an optimal buffer size
        long nextTime;
        long startTime;
        long sS = (long) sumSizes;
        for (int i = 0; i < clusterCount; i++) {
            nextTime = Math.max(0, preloadTime * ((long) sizes[i]) / sS);
            startTime = System.currentTimeMillis();
            assortments[i] = new plasmaWordIndexAssortment(
                    assortmentsPath,
                    payloadrow,
                    i + 1,
                    (int) (completeBufferKB * (long) sizes[i] / (long) sumSizes),
                    nextTime,
                    log);
            preloadTime -= System.currentTimeMillis() - startTime;
            sS -= sizes[i];
        }
    }
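
    // Note (a worked example): clusterCapacity is the triangular number
    // clusterCount * (clusterCount + 1) / 2 because assortment i stores exactly
    // i + 1 URL references per word hash. With an illustrative clusterCount of 64,
    // a single word hash can hold at most 64 * 65 / 2 = 2080 references in the cluster.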

    private indexContainer storeSingular(indexContainer newContainer) throws IOException {
        // this tries to store the record. If the record does not fit, or the same hash already
        // exists and would not fit together with the new record, then the record is deleted from
        // the assortment(s) and returned together with the newRecord.
        // if storage was successful, NULL is returned.
        if (newContainer.size() > clusterCount) return newContainer; // it will not fit
        indexContainer buffer;
        while ((buffer = assortments[newContainer.size() - 1].remove(newContainer.getWordHash())) != null) {
            if (newContainer.add(buffer, -1) == 0) return newContainer; // security check; otherwise this loop does not terminate
            if (newContainer.size() > clusterCount) return newContainer; // it will not fit
        }
        // the assortment (newContainer.size() - 1) should now be empty. put it in there
        assortments[newContainer.size() - 1].store(newContainer);
        // return null to show that we have stored the new record successfully
        return null;
    }

    private void storeForced(indexContainer newContainer) throws IOException {
        // this stores the record and overwrites an existing record.
        // this is safe if we can be sure that the record did not exist before.
        if ((newContainer == null) || (newContainer.size() == 0) || (newContainer.size() > clusterCount)) return; // it will not fit
        assortments[newContainer.size() - 1].store(newContainer);
    }

    private void storeStretched(indexContainer newContainer) throws IOException {
        // this stores the record and stretches the storage over
        // all the assortments that are necessary to fit in the record
        // IMPORTANT: it must be ensured that the wordHash does not exist in the cluster before,
        // i.e. by calling removeFromAll
        if (newContainer.size() <= clusterCount) {
            storeForced(newContainer);
            return;
        }

        // calculate the minimum cluster insert point
        int clusterMinStart = clusterCount;
        int cap = clusterCapacity - newContainer.size() - 2 * clusterCount;
        while (cap > 0) {
            cap -= clusterMinStart;
            clusterMinStart--;
        }

        // point the real cluster insert point somewhere between the minimum and the maximum
        int clusterStart = clusterCount - (int) (Math.random() * (clusterCount - clusterMinStart));

        // do the insert
        indexContainer c;
        Iterator i = newContainer.entries();
        for (int j = clusterStart; j >= 1; j--) {
            c = new indexContainer(newContainer.getWordHash(), payloadrow, false);
            for (int k = 0; k < j; k++) {
                if (i.hasNext()) {
                    c.add((indexRWIEntry) i.next(), newContainer.updated());
                } else {
                    storeForced(c);
                    return;
                }
            }
            storeForced(c);
        }
    }
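
    // In effect, storeStretched writes the word's references as a run of containers
    // of descending sizes clusterStart, clusterStart - 1, ..., 1 (a triangular
    // packing): e.g. a container of 10 entries with clusterStart = 4 is split into
    // chunks of 4, 3, 2 and 1, each stored in the assortment of the matching size.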

    public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
        indexContainer container = new indexContainer(wordHash, payloadrow, false);
        container.add(newEntry);
        return addEntries(container, updateTime, dhtCase);
    }

    public long getUpdateTime(String wordHash) {
        indexContainer entries = getContainer(wordHash, null, false, -1);
        if (entries == null) return 0;
        return entries.updated();
    }

    public indexContainer addEntries(indexContainer newContainer, long creationTime, boolean dhtCase) {
        // this is called by the index RAM cache flush process
        // it returns NULL if the storage was successful
        // it returns a new container if the given container cannot be stored
        // containers that are returned will be stored in a WORDS file
        if (newContainer == null) return null;
        if (newContainer.size() > clusterCapacity) return newContainer; // it will not fit

        // split the container into several smaller containers that will take the whole thing
        // first find out how the container can be split
        int testsize = Math.min(clusterCount, newContainer.size());
        int[] spaces = new int[testsize];
        for (int i = testsize - 1; i >= 0; i--) spaces[i] = 0;
        int need = newContainer.size();
        int selectedAssortment = testsize - 1;
        while (selectedAssortment >= 0) {
            if (selectedAssortment + 1 <= need) {
                spaces[selectedAssortment] = (assortments[selectedAssortment].get(newContainer.getWordHash()) == null) ? (selectedAssortment + 1) : 0;
                need -= spaces[selectedAssortment];
                assert (need >= 0);
                if (need == 0) break;
            }
            selectedAssortment--;
        }
        if (need == 0) {
            // we found spaces, so we can put the newContainer into these spaces
            indexContainer c;
            Iterator i = newContainer.entries();
            for (int j = testsize - 1; j >= 0; j--) {
                if (spaces[j] == 0) continue;
                c = new indexContainer(newContainer.getWordHash(), payloadrow, false);
                for (int k = 0; k <= j; k++) {
                    assert (i.hasNext());
                    c.add((indexRWIEntry) i.next(), newContainer.updated());
                }
                try {
                    storeForced(c);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            return null;
        }

        if (newContainer.size() <= clusterCount) try {
            newContainer = storeSingular(newContainer);
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (newContainer == null) return null;

        // clean up the whole thing and then try to insert the container
        newContainer.add(deleteContainer(newContainer.getWordHash(), -1), -1);
        if (newContainer.size() > clusterCapacity) return newContainer;
        try {
            storeStretched(newContainer);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
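
    // The space search above is greedy from the largest candidate assortment down:
    // e.g. for a container of 5 entries where the 5-slot assortment is already
    // occupied for this word hash, it settles on the free 4-slot and 1-slot
    // assortments (4 + 1 = 5), so the container is stored in exactly-fitting chunks.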

    public indexContainer deleteContainer(String wordHash) {
        return deleteContainer(wordHash, -1);
    }

    public indexContainer deleteContainer(String wordHash, long maxTime) {
        // removes all records from all the assortments and returns them
        indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
        long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
        long remainingTime;
        for (int i = 0; i < clusterCount; i++) {
            buffer = assortments[i].remove(wordHash);
            remainingTime = limitTime - System.currentTimeMillis();
            if (0 > remainingTime) break;
            if (buffer != null) record.add(buffer, remainingTime);
        }
        return record;
    }

    /*
    public int removeEntries(String wordHash, String[] referenceHashes, boolean deleteComplete) {
        indexContainer c = deleteContainer(wordHash, -1);
        int b = c.size();
        c.removeEntries(wordHash, referenceHashes, false);
        if (c.size() != 0) {
            addEntries(c, c.updated(), false);
        }
        return b - c.size();
    }
    */

    public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
        indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
        boolean found = false;
        for (int i = 0; i < clusterCount; i++) {
            buffer = assortments[i].remove(wordHash);
            if ((buffer != null) && (buffer.remove(urlHash) != null)) found = true;
            record.add(buffer, -1);
            if (found) break;
        }
        // put back the remaining entries
        if (record.size() != 0) {
            addEntries(record, record.updated(), false);
        }
        return found;
    }

    public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) {
        indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
        int initialSize = urlHashes.size();
        for (int i = 0; i < clusterCount; i++) {
            buffer = assortments[i].remove(wordHash);
            if (buffer != null) {
                // sort out the URL hashes that shall be deleted
                Iterator bi = buffer.entries();
                indexRWIEntry entry;
                while (bi.hasNext()) {
                    entry = (indexRWIEntry) bi.next();
                    if (urlHashes.remove(entry.urlHash())) bi.remove();
                }
                record.add(buffer, -1);
            }
            if (urlHashes.size() == 0) break;
        }
        // put back the remaining entries
        if (record.size() != 0) {
            addEntries(record, record.updated(), false);
        }
        return initialSize - urlHashes.size();
    }

    public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
        // collect all records from all the assortments and return them
        indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
        long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
        for (int i = 0; i < clusterCount; i++) {
            buffer = assortments[i].get(wordHash);
            if (buffer != null) {
                buffer.select(urlselection);
                record.add(buffer, -1);
            }
            if (System.currentTimeMillis() > timeout) break;
        }
        return record;
    }

    public int indexSize(String wordHash) {
        int size = 0;
        for (int i = 0; i < clusterCount; i++) {
            if (assortments[i].contains(wordHash)) size += i + 1;
        }
        return size;
    }

    public Iterator wordContainers(String startWordHash, boolean rot) {
        try {
            return wordContainers(startWordHash, true, rot);
        } catch (IOException e) {
            return new HashSet().iterator();
        }
    }

    public Iterator wordContainers(String startWordHash, boolean up, boolean rot) throws IOException {
        // iterates over indexContainer objects
        HashSet containerIterators = new HashSet();
        for (int i = 0; i < clusterCount; i++) containerIterators.add(assortments[i].containers(startWordHash, up, rot));
        return kelondroMergeIterator.cascade(containerIterators, new indexContainerOrder(kelondroNaturalOrder.naturalOrder), indexContainer.containerMergeMethod, up);
    }

    public int size() {
        int total = 0;
        for (int i = 0; i < clusterCount; i++) total += assortments[i].size();
        return total;
    }

    public int[] sizes() {
        int[] sizes = new int[clusterCount];
        for (int i = 0; i < clusterCount; i++) sizes[i] = assortments[i].size();
        return sizes;
    }

    public int cacheChunkSizeAvg() {
        int i = 0;
        for (int j = 0; j < clusterCount; j++) {
            i += assortments[j].cacheNodeChunkSize();
        }
        return i / clusterCount;
    }

    public int cacheObjectSizeAvg() {
        long c = 0, k = 0;
        for (int j = 0; j < clusterCount; j++) {
            c += assortments[j].size() * assortments[j].cacheObjectChunkSize();
            k += assortments[j].size();
        }
        return (k > 0) ? (int) (c / k) : 0;
    }

    public int[] cacheNodeStatus() {
        int[][] a = new int[assortments.length][];
        for (int i = assortments.length - 1; i >= 0; i--) a[i] = assortments[i].cacheNodeStatus();
        return kelondroRecords.cacheCombinedStatus(a, assortments.length);
    }

    public long[] cacheObjectStatus() {
        long[][] a = new long[assortments.length][];
        for (int i = assortments.length - 1; i >= 0; i--) a[i] = assortments[i].cacheObjectStatus();
        return kelondroCache.combinedStatus(a, a.length);
    }

    public void close(int waitingSeconds) {
        for (int i = 0; i < clusterCount; i++) assortments[i].close();
    }

}
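
For context, a minimal driver sketch for this cluster (an illustration only: the path and sizes are hypothetical, and a kelondroRow matching this revision's reverse-word-index entry layout must be supplied by the caller, since its construction is not shown in this file):

import java.io.File;
import java.io.IOException;

import de.anomic.index.indexContainer;
import de.anomic.kelondro.kelondroRow;
import de.anomic.plasma.plasmaWordIndexAssortmentCluster;

public class AssortmentClusterDemo {
    public static void run(kelondroRow payloadrow) throws IOException {
        plasmaWordIndexAssortmentCluster cluster = new plasmaWordIndexAssortmentCluster(
                new File("DATA/INDEX/ASSORTMENTS"), // hypothetical assortments path
                64,          // clusterCount: assortments for 1..64 references per word
                payloadrow,  // row layout of a single reverse word index entry
                4096,        // total buffer in kB, split across the assortment files
                0,           // no preload time
                null);       // no log
        // collect all references stored for one (dummy) word hash, without a time limit
        indexContainer c = cluster.getContainer("AAAAAAAAAAAA", null, false, -1);
        System.out.println("stored references: " + ((c == null) ? 0 : c.size()));
        cluster.close(-1);
    }
}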