- full integration of new LURL database in INDEX

- added migration method for urlHash.db into INDEX

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2819 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 02c66c04f2
commit 06854988da

@ -98,7 +98,7 @@ public final class IndexImport_p {
if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
if (importerThread != null) {
importerThread.init(new File(importPath), switchboard.indexPublicTextPath, cacheSize, 100);
importerThread.init(new File(importPath), switchboard.indexPath, cacheSize, 100);
importerThread.startIt();
}
prop.put("LOCATION","");

@ -73,6 +73,7 @@ import de.anomic.server.serverMemory;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.dirlistComparator;
import de.anomic.tools.md5DirFileFilter;
import de.anomic.yacy.yacyCore;
@ -174,14 +175,15 @@ public class dir {
final byte[] binary = (byte[]) post.get("file$file", new byte[0]);
try {
serverFileUtils.write(binary, newfile);
String md5s = serverCodings.encodeMD5Hex(newfile);
byte[] md5 = serverCodings.encodeMD5Raw(newfile);
String md5s = serverCodings.encodeHex(md5);
serverFileUtils.write((md5s + "\n" + description).getBytes("UTF-8"), newfilemd5); // generate md5
// index file info
if (post.get("indexing", "").equals("on")) {
final String urlstring = yacyhURL(yacyCore.seedDB.mySeed, filename, md5s);
final String phrase = filename.replace('.', ' ').replace('_', ' ').replace('-', ' ');
indexPhrase(switchboard, urlstring, phrase, description);
indexPhrase(switchboard, urlstring, phrase, description, md5);
}
} catch (IOException e) {}
} else if (action.equals("newdir") && (uploadAuthorization || adminAuthorization)) {
@ -354,20 +356,27 @@ public class dir {
return "http://share." + seed.getHexHash() + ".yacyh/" + filename + "?md5=" + md5;
}
public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) {
public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr, byte[] md5) {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url.toNormalform(), "YaCyShare: " + descr, new Date(), new Date(),
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
false, /*localneed*/
condenser.RESULT_WORD_ENTROPHY,
"**", /*language*/
indexEntryAttribute.DT_SHARE, /*doctype*/
phrase.length(), /*size*/
condenser.RESULT_NUMB_WORDS
url,
"YaCyShare: " + descr,
yacyCore.seedDB.mySeed.getName(),
"", // tags
"", // ETag
new Date(), // modification
new Date(), // loadtime
new Date(), // freshtime
"AAAAAAAAAAAA", // referrer
md5, // md5
(long) phrase.length(), // size
condenser.RESULT_NUMB_WORDS, // word count
indexEntryAttribute.DT_SHARE, // doctype
new bitfield(4),
"**", // language
0,0,0,0,0,0
);
switchboard.urlPool.loadedURL.store(newEntry);
switchboard.urlPool.loadedURL.stack(

@ -124,7 +124,7 @@ public final class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// generating a new loaded URL entry
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr);
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);

@ -97,7 +97,7 @@ public final class transferURL {
if (urls == null) {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
} else {
lEntry = sb.urlPool.loadedURL.newEntry(urls, true);
lEntry = sb.urlPool.loadedURL.newEntry(urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???

@ -550,6 +550,14 @@ final class dbTable implements kelondroIndex {
}
}
/**
 * Not supported by dbTable: this implementation always throws
 * UnsupportedOperationException and never touches the row.
 */
public synchronized void addUnique(kelondroRow.Entry row) throws IOException {
throw new UnsupportedOperationException();
}
/**
 * Not supported by dbTable: this implementation always throws
 * UnsupportedOperationException; neither the row nor the date is used.
 */
public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException {
throw new UnsupportedOperationException();
}
public kelondroRow.Entry remove(byte[] key) throws IOException {
try {

@ -78,7 +78,7 @@ public class indexContainer extends kelondroRowSet {
}
public int add(indexEntry entry) {
this.add(entry.toKelondroEntry());
this.addUnique(entry.toKelondroEntry());
return 1;
}

@ -425,16 +425,6 @@ public class indexURL {
}
}
public boolean remove(String hash) {
if (hash == null) return false;
try {
urlIndexFile.remove(hash.getBytes());
return true;
} catch (IOException e) {
return false;
}
}
public void close() throws IOException {
if (urlIndexFile != null) {
urlIndexFile.close();

@ -411,7 +411,7 @@ public class kelondroAttrSeq {
kelondroRowCollection collection = new kelondroRowCollection(structure.seqrow, seq.size());
Iterator i = seq.iterator();
while (i.hasNext()) {
collection.add(structure.seqrow.newEntry(((String) i.next()).getBytes()));
collection.addUnique(structure.seqrow.newEntry(((String) i.next()).getBytes()));
}
return collection;
}

@ -90,26 +90,21 @@ public class kelondroBufferedIndex implements kelondroIndex {
}
public synchronized kelondroRow.Entry get(byte[] key) throws IOException {
long handle = index.profile().startRead();
long handle = (index instanceof kelondroFlexSplitTable) ? -1 : index.profile().startRead();
kelondroRow.Entry entry = null;
entry = (kelondroRow.Entry) buffer.get(key);
if (entry == null) entry = index.get(key);
index.profile().stopRead(handle);
if (handle >= 0) index.profile().stopRead(handle);
return entry;
}
public synchronized void add(kelondroRow.Entry newentry) throws IOException {
assert (index instanceof kelondroRowSet);
((kelondroRowSet) index).add(newentry);
public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
return put(row, null);
}
public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
public synchronized kelondroRow.Entry put(kelondroRow.Entry newentry) throws IOException {
long handle = index.profile().startWrite();
byte[] key = newentry.getColBytes(index.primarykey());
long handle = (index instanceof kelondroFlexSplitTable) ? -1 : index.profile().startWrite();
byte[] key = row.getColBytes(index.primarykey());
kelondroRow.Entry oldentry = null;
oldentry = (kelondroRow.Entry) buffer.get(key);
if (oldentry == null) {
@ -117,45 +112,67 @@ public class kelondroBufferedIndex implements kelondroIndex {
oldentry = index.get(key);
if (oldentry == null) {
// this was not anywhere
buffer.put(key, newentry);
if (((buffer.size() > bufferFlushMinimum) && (serverMemory.available() > memBlockLimit))
|| (buffer.size() > bufferFlushLimit))
flush();
if (entryDate == null) {
buffer.put(key, row);
if (((buffer.size() > bufferFlushMinimum) && (serverMemory.available() > memBlockLimit))
|| (buffer.size() > bufferFlushLimit))
flush();
} else {
index.put(row, entryDate);
}
} else {
// replace old entry
index.put(newentry);
if (entryDate == null) {
index.put(row);
} else {
index.put(row, entryDate);
}
}
} else {
// the entry is already in buffer
// simply replace old entry
buffer.put(key, newentry);
if (entryDate == null) {
buffer.put(key, row);
} else {
buffer.remove(key);
index.put(row, entryDate);
}
}
index.profile().stopWrite(handle);
if (handle >= 0) index.profile().stopWrite(handle);
return oldentry;
}
/**
 * Adds a row to the backing index using its addUnique operation, i.e. the
 * variant that does not look up an existing entry for the same key first.
 * The write buffer of this class is not consulted or modified.
 *
 * The call goes through the backing index's own addUnique instead of a
 * downcast to kelondroRowSet: the former cast was protected only by an
 * assert, so with assertions disabled any other backing index type ended
 * in a ClassCastException.
 *
 * @param row the row to append
 * @throws IOException propagated from the backing index
 */
public synchronized void addUnique(kelondroRow.Entry row) throws IOException {
    index.addUnique(row);
}
/**
 * Adds a row to the backing index using its dated addUnique operation.
 * The write buffer of this class is not consulted or modified.
 *
 * The entry date is handed on to the backing index rather than being
 * dropped, so that an index which uses the date can honour it.
 *
 * @param row       the row to append
 * @param entryDate date associated with the row; passed on unchanged
 * @throws IOException propagated from the backing index
 */
public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException {
    index.addUnique(row, entryDate);
}
public synchronized kelondroRow.Entry remove(byte[] key) throws IOException {
long handle = index.profile().startDelete();
long handle = (index instanceof kelondroFlexSplitTable) ? -1 : index.profile().startDelete();
kelondroRow.Entry oldentry = null;
oldentry = (kelondroRow.Entry) buffer.remove(key);
if (oldentry == null) {
// try the collection
return index.remove(key);
}
index.profile().stopDelete(handle);
if (handle >= 0) index.profile().stopDelete(handle);
return oldentry;
}
public synchronized kelondroRow.Entry removeOne() throws IOException {
long handle = index.profile().startDelete();
long handle = (index instanceof kelondroFlexSplitTable) ? -1 : index.profile().startDelete();
if (buffer.size() > 0) {
byte[] key = (byte[]) buffer.keySet().iterator().next();
kelondroRow.Entry entry = (kelondroRow.Entry) buffer.remove(key);
index.profile().stopDelete(handle);
if (handle >= 0) index.profile().stopDelete(handle);
return entry;
} else {
kelondroRow.Entry entry = index.removeOne();
index.profile().stopDelete(handle);
if (handle >= 0) index.profile().stopDelete(handle);
return entry;
}
}

@ -52,6 +52,13 @@ public class kelondroBytesIntMap {
return (int) oldentry.getColLong(1);
}
/**
 * Stores the pair (key, i) as a fresh row: column 0 receives the key,
 * column 1 the integer. The row is handed to the underlying index via
 * addUnique, so no previous value is looked up or returned.
 *
 * @param key the key bytes written to column 0
 * @param i   the integer written to column 1
 * @throws IOException propagated from the underlying index
 */
public synchronized void addi(byte[] key, int i) throws IOException {
    final kelondroRow.Entry pair = ki.row().newEntry();
    pair.setCol(0, key);
    pair.setCol(1, i);
    ki.addUnique(pair);
}
public synchronized int removei(byte[] key) throws IOException {
// returns the integer index of the key, if the key can be found and was removed
// and -1 if the key was not found.

@ -159,7 +159,7 @@ public class kelondroCollectionIndex {
ientry.setCol(idx_col_lastread, t);
ientry.setCol(idx_col_lastwrote, t);
if (index instanceof kelondroBufferedIndex)
((kelondroBufferedIndex) index).add(ientry);
((kelondroBufferedIndex) index).addUnique(ientry);
else
index.put(ientry);
@ -540,13 +540,13 @@ public class kelondroCollectionIndex {
// fill index with values
kelondroRowSet collection = new kelondroRowSet(rowdef);
collection.add(rowdef.newEntry(new byte[][]{"abc".getBytes(), "efg".getBytes()}));
collection.addUnique(rowdef.newEntry(new byte[][]{"abc".getBytes(), "efg".getBytes()}));
collectionIndex.put("erstes".getBytes(), collection);
for (int i = 0; i <= 17; i++) {
collection = new kelondroRowSet(rowdef);
for (int j = 0; j < i; j++) {
collection.add(rowdef.newEntry(new byte[][]{("abc" + j).getBytes(), "xxx".getBytes()}));
collection.addUnique(rowdef.newEntry(new byte[][]{("abc" + j).getBytes(), "xxx".getBytes()}));
}
System.out.println("put key-" + i + ": " + collection.toString());
collectionIndex.put(("key-" + i).getBytes(), collection);
@ -556,7 +556,7 @@ public class kelondroCollectionIndex {
for (int i = 0; i <= 17; i++) {
collection = new kelondroRowSet(rowdef);
for (int j = 0; j < i; j++) {
collection.add(rowdef.newEntry(new byte[][]{("def" + j).getBytes(), "xxx".getBytes()}));
collection.addUnique(rowdef.newEntry(new byte[][]{("def" + j).getBytes(), "xxx".getBytes()}));
}
collectionIndex.merge(("key-" + i).getBytes(), collection);
}

@ -53,6 +53,7 @@ public class kelondroFlexSplitTable implements kelondroIndex {
// initialized tables map
this.tables = new HashMap();
if (!(path.exists())) path.mkdirs();
String[] dir = path.list();
String date;
@ -78,7 +79,7 @@ public class kelondroFlexSplitTable implements kelondroIndex {
StringBuffer suffix = new StringBuffer(6);
synchronized (thisCalendar) {
thisCalendar.setTime(date);
month = thisCalendar.get(Calendar.MONTH);
month = thisCalendar.get(Calendar.MONTH) + 1;
year = thisCalendar.get(Calendar.YEAR);
}
if ((year < 1970) && (year >= 70)) suffix.append("19").append(Integer.toString(year));
@ -136,7 +137,6 @@ public class kelondroFlexSplitTable implements kelondroIndex {
public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
kelondroRow.Entry r = remove(row.getColBytes(0));
String suffix = dateSuffix(entryDate);
if (suffix == null) return null;
kelondroFlexTable table = (kelondroFlexTable) tables.get(suffix);
@ -150,6 +150,22 @@ public class kelondroFlexSplitTable implements kelondroIndex {
return r;
}
/**
 * Adds the row using the current time as its entry date; delegates to
 * addUnique(row, entryDate) with a freshly created Date.
 */
public synchronized void addUnique(kelondroRow.Entry row) throws IOException {
addUnique(row, new Date());
}
/**
 * Adds the row to the partial table selected by the date suffix of
 * entryDate. If dateSuffix yields null the call does nothing. A table
 * that is not yet registered under the suffix is created and registered
 * before the row is added.
 *
 * @param row       the row to append
 * @param entryDate date used to select the target table; also passed on to it
 * @throws IOException propagated from table creation or from the target table
 */
public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException {
    final String suffix = dateSuffix(entryDate);
    if (suffix != null) {
        kelondroFlexTable target = (kelondroFlexTable) tables.get(suffix);
        if (target == null) {
            // no table for this suffix yet: create it and remember it
            target = new kelondroFlexTable(path, tablename + "." + suffix, buffersize / (tables.size() + 1), -1, rowdef, objectOrder);
            tables.put(suffix, target);
        }
        target.addUnique(row, entryDate);
    }
}
public synchronized kelondroRow.Entry remove(byte[] key) throws IOException {
Iterator i = tables.values().iterator();
kelondroFlexTable table;

@ -93,7 +93,7 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr
indexentry = ri.row().newEntry();
indexentry.setCol(0, node.getValueRow());
indexentry.setCol(1, i);
ri.add(indexentry);
ri.addUnique(indexentry);
if ((i % 10000) == 0) {
System.out.print('.');
System.out.flush();
@ -139,7 +139,7 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr
return super.get(i);
}
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
@ -152,6 +152,14 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr
return super.set(i, row);
}
/**
 * Dated variant of addUnique. The entryDate argument is not used here;
 * the call simply delegates to addUnique(row).
 */
public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException {
addUnique(row);
}
/**
 * Appends the row through the superclass add method and records the value
 * returned by that call in the key index, keyed by the bytes of column 0.
 *
 * @param row the row to append
 * @throws IOException propagated from the superclass add or from the key index
 */
public synchronized void addUnique(kelondroRow.Entry row) throws IOException {
    final byte[] key = row.getColBytes(0);
    final int position = super.add(row);
    index.addi(key, position);
}
public synchronized kelondroRow.Entry remove(byte[] key) throws IOException {
int i = index.removei(key);
if (i < 0) return null;

@ -64,6 +64,8 @@ public interface kelondroIndex {
public kelondroRow.Entry get(byte[] key) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException;
public void addUnique(kelondroRow.Entry row) throws IOException; // no double-check
public void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException; // no double-check
public kelondroRow.Entry remove(byte[] key) throws IOException;
public kelondroRow.Entry removeOne() throws IOException;
public Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException;

@ -58,7 +58,7 @@ public class kelondroIntBytesMap {
newentry = index.row().newEntry();
newentry.setCol(0, (long) ii);
newentry.setCol(1, value);
index.add(newentry);
index.addUnique(newentry);
} catch (IOException e) {}
}

@ -75,6 +75,14 @@ public class kelondroRAMIndex implements kelondroIndex {
return (kelondroRow.Entry) index.put(row.getColBytes(0), row);
}
/**
 * Not supported by kelondroRAMIndex: this implementation always throws
 * UnsupportedOperationException and never touches the row.
 */
public synchronized void addUnique(kelondroRow.Entry row) throws IOException {
throw new UnsupportedOperationException();
}
/**
 * Not supported by kelondroRAMIndex: this implementation always throws
 * UnsupportedOperationException; neither the row nor the date is used.
 */
public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException {
throw new UnsupportedOperationException();
}
public synchronized Entry remove(byte[] key) {
return (kelondroRow.Entry) index.remove(key);
}

@ -148,7 +148,7 @@ public class kelondroRecords {
protected int readHit, readMiss, writeUnique, writeDouble, cacheDelete, cacheFlush;
// optional logger
protected Logger theLogger = null;
protected Logger theLogger = Logger.getLogger("KELONDRO"); // default logger
// tracking of file creation
protected boolean fileExisted;
@ -1046,19 +1046,38 @@ public class kelondroRecords {
synchronized (USAGE) {
if (USAGE.FREEC != 0) {
Handle h = USAGE.FREEH;
long repair_position = POS_FREEH;
int iter = 0;
while (h.index != NUL) {
//System.out.println("handle=0x" + Integer.toHexString(h.index));
// check handle
seekp = seekpos(h);
if (seekp > entryFile.length()) {
// repair last handle store position
this.theLogger.severe("KELONDRO WARNING " + this.filename + ": seek position " + seekp + "/" + h.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize) + " after " + iter + " iterations");
entryFile.writeInt(repair_position, NUL);
return markedDeleted;
}
// handle seems to be correct. store handle
markedDeleted.add(h);
// move to next handle
repair_position = seekp;
h = new Handle(entryFile.readInt(seekp));
// double-check for already stored handles: detect loops
if (markedDeleted.contains(h)) {
// loop detection
this.theLogger.severe("KELONDRO WARNING " + this.filename + ": FREE-Queue contains loops");
return markedDeleted; // TODO: automatic fix
entryFile.writeInt(repair_position, NUL);
return markedDeleted;
}
markedDeleted.add(h);
seekp = seekpos(h);
if (seekp > entryFile.length()) throw new kelondroException("deletedHandles: seek position " + seekp + "/" + h.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
h = new Handle(entryFile.readInt(seekp));
// this appears to be correct. go on.
iter++;
if (System.currentTimeMillis() > timeLimit) throw new kelondroException(filename, "time limit of " + maxTime + " exceeded; > " + markedDeleted.size() + " deleted entries");
}
System.out.println("\nDEBUG: " + iter + " deleted entries in " + entryFile.name());
}
}
return markedDeleted;

@ -243,6 +243,13 @@ public class kelondroRow {
return rowinstance[colstart[column]] == 0;
}
/**
 * Writes a single character, narrowed to one byte, into the column that
 * is registered under the given nickname. The nickname table is built on
 * first use. An unknown nickname is ignored without any change.
 *
 * @param nickname the column nickname to look up
 * @param c        the character to store; it is cast to byte
 */
public void setCol(String nickname, char c) {
    if (nickref == null) genNickRef();
    final Object[] ref = (Object[]) nickref.get(nickname);
    if (ref != null) {
        // ref[1] holds the position inside rowinstance that is written
        final int position = ((Integer) ref[1]).intValue();
        rowinstance[position] = (byte) c;
    }
}
public void setCol(String nickname, byte[] cell) {
if (nickref == null) genNickRef();
Object[] ref = (Object[]) nickref.get(nickname);

@ -24,6 +24,7 @@
package de.anomic.kelondro;
import java.util.Date;
import java.util.Iterator;
import java.util.Set;
@ -198,10 +199,14 @@ public class kelondroRowCollection {
this.lastTimeWrote = System.currentTimeMillis();
}
public void add(kelondroRow.Entry a) {
add(a.bytes(), 0, a.bytes().length);
/**
 * Appends the given row by passing its bytes, from offset 0 over their
 * full length, to add(byte[], int, int).
 */
public void addUnique(kelondroRow.Entry row) {
add(row.bytes(), 0, row.bytes().length);
}
/**
 * Dated variant of addUnique. The entryDate argument is not used here;
 * the call simply delegates to addUnique(row).
 */
public void addUnique(kelondroRow.Entry row, Date entryDate) {
addUnique(row);
}
public void add(byte[] a) {
add(a, 0, a.length);
}
@ -225,7 +230,7 @@ public class kelondroRowCollection {
kelondroRow.Entry entry;
while (i.hasNext()) {
entry = (kelondroRow.Entry) i.next();
add(entry);
addUnique(entry);
}
}

@ -97,7 +97,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
set(index, entry);
removeMarker.remove(new Integer(index));
} else if (index < 0) {
add(entry);
addUnique(entry);
} else {
oldentry = get(index);
set(index, entry);

@ -114,6 +114,14 @@ public class kelondroSplittedTree implements kelondroIndex {
return put(row);
}
/**
 * Not supported by kelondroSplittedTree: this implementation always throws
 * UnsupportedOperationException and never touches the row.
 */
public synchronized void addUnique(kelondroRow.Entry row) throws IOException {
throw new UnsupportedOperationException();
}
/**
 * Not supported by kelondroSplittedTree: this implementation always throws
 * UnsupportedOperationException; neither the row nor the date is used.
 */
public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException {
throw new UnsupportedOperationException();
}
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
return ktfs[partition(row.getColBytes(0))].put(row);
}

@ -553,6 +553,14 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
return result;
}
/**
 * Not supported by kelondroTree: this implementation always throws
 * UnsupportedOperationException and never touches the row.
 */
public synchronized void addUnique(kelondroRow.Entry row) throws IOException {
throw new UnsupportedOperationException();
}
/**
 * Not supported by kelondroTree: this implementation always throws
 * UnsupportedOperationException; neither the row nor the date is used.
 */
public synchronized void addUnique(kelondroRow.Entry row, Date entryDate) throws IOException {
throw new UnsupportedOperationException();
}
private void assignChild(Node parentNode, Node childNode, int childType) throws IOException {
parentNode.setOHHandle(childType, childNode.handle());
childNode.setOHHandle(parent, parentNode.handle());

@ -75,9 +75,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
this.log.logFine("Initializing source word index db.");
this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2, this.log, sb.getConfigBool("useCollectionIndex", false));
this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, true, (this.cacheSize/2)/1024, preloadTime / 2, this.log, sb.getConfigBool("useCollectionIndex", false));
this.log.logFine("Initializing import URL db.");
this.importUrlDB = new plasmaCrawlLURL(this.importPath, (this.cacheSize/2)/1024, preloadTime / 2, false);
this.importUrlDB = new plasmaCrawlLURL(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2, false);
this.importStartSize = this.importWordIndex.size();
}

@ -171,6 +171,16 @@ public class plasmaCrawlEURL extends indexURL {
return new Entry(url, referrer, initiator, executor, name, failreason, flags);
}
/**
 * Removes the entry stored under the given hash from the URL index file.
 *
 * @param hash the hash of the entry to remove; null is rejected
 * @return true if the remove call completed without an IOException
 *         (regardless of what it returned); false if hash is null or an
 *         IOException occurred
 */
public boolean remove(String hash) {
    boolean done = false;
    if (hash != null) {
        try {
            urlIndexFile.remove(hash.getBytes());
            done = true;
        } catch (IOException e) {
            // failure is reported to the caller through the return value only
            done = false;
        }
    }
    return done;
}
public synchronized void stackPushEntry(Entry e) {
rejectedStack.add(e.hash);
}

@ -67,6 +67,8 @@ import de.anomic.http.httpc.response;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBufferedIndex;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
@ -74,6 +76,7 @@ import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@ -90,14 +93,22 @@ public final class plasmaCrawlLURL extends indexURL {
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
public plasmaCrawlLURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
private boolean newdb;
public plasmaCrawlLURL(File plasmaPath, File indexPath, int bufferkb, long preloadTime, boolean newdb) {
super();
File cacheFile = new File(cachePath, "urlHash.db");
this.newdb = newdb;
cacheFile.getParentFile().mkdirs();
try {
urlIndexFile = new kelondroBufferedIndex(new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef));
if (newdb) {
urlIndexFile = new kelondroBufferedIndex(
new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, plasmaCrawlLURLNewEntry.rowdef, kelondroBase64Order.enhancedCoder));
} else {
File oldLURLDB = new File(plasmaPath, "urlHash.db");
oldLURLDB.getParentFile().mkdirs();
urlIndexFile = new kelondroBufferedIndex(
new kelondroTree(oldLURLDB, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef));
}
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@ -133,21 +144,21 @@ public final class plasmaCrawlLURL extends indexURL {
}
}
public void notifyGCrawl(String urlHash, String initiatorHash, String executorHash) {
public synchronized void notifyGCrawl(String urlHash, String initiatorHash, String executorHash) {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public void flushCacheSome() {
public synchronized void flushCacheSome() {
try {
((kelondroBufferedIndex) urlIndexFile).flushSome();
} catch (IOException e) {}
}
public int writeCacheSize() {
public synchronized int writeCacheSize() {
return ((kelondroBufferedIndex) urlIndexFile).writeBufferSize();
}
public plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) {
public synchronized plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -158,13 +169,16 @@ public final class plasmaCrawlLURL extends indexURL {
try {
kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
return new plasmaCrawlLURLOldEntry(entry, searchedWord);
if (newdb)
return new plasmaCrawlLURLNewEntry(entry, searchedWord);
else
return new plasmaCrawlLURLOldEntry(entry, searchedWord);
} catch (IOException e) {
return null;
}
}
public void store(plasmaCrawlLURLEntry entry) throws IOException {
public synchronized void store(plasmaCrawlLURLEntry entry) throws IOException {
// Check if there is a more recent Entry already in the DB
plasmaCrawlLURLEntry oldEntry;
try {
@ -187,23 +201,48 @@ public final class plasmaCrawlLURL extends indexURL {
urlIndexFile.put(entry.toRowEntry(), entry.loaddate());
}
public synchronized plasmaCrawlLURLEntry newEntry(String propStr, boolean setGlobal) {
public synchronized plasmaCrawlLURLEntry newEntry(String propStr) {
if (propStr.startsWith("{") && propStr.endsWith("}")) {
return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
if (newdb)
return new plasmaCrawlLURLNewEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
else
return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
} else {
return null;
}
}
public synchronized plasmaCrawlLURLEntry newEntry(String url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
int size, int wordCount) {
plasmaCrawlLURLEntry e = new plasmaCrawlLURLOldEntry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
return e;
public synchronized plasmaCrawlLURLEntry newEntry(
URL url,
String descr,
String author,
String tags,
String ETag,
Date mod,
Date load,
Date fresh,
String referrer,
byte[] md5,
long size,
int wc,
char dt,
bitfield flags,
String lang,
int llocal,
int lother,
int laudio,
int limage,
int lvideo,
int lapp) {
if (newdb)
return new plasmaCrawlLURLNewEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp);
else
return new plasmaCrawlLURLOldEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp);
}
public int getStackSize(int stack) {
public synchronized int getStackSize(int stack) {
switch (stack) {
case 1: return externResultStack.size();
case 2: return searchResultStack.size();
@ -215,7 +254,7 @@ public final class plasmaCrawlLURL extends indexURL {
return -1;
}
public String getUrlHash(int stack, int pos) {
public synchronized String getUrlHash(int stack, int pos) {
switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(0, urlHashLength);
case 2: return ((String) searchResultStack.get(pos)).substring(0, urlHashLength);
@ -227,7 +266,7 @@ public final class plasmaCrawlLURL extends indexURL {
return null;
}
public String getInitiatorHash(int stack, int pos) {
public synchronized String getInitiatorHash(int stack, int pos) {
switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2);
case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2);
@ -239,7 +278,7 @@ public final class plasmaCrawlLURL extends indexURL {
return null;
}
public String getExecutorHash(int stack, int pos) {
public synchronized String getExecutorHash(int stack, int pos) {
switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3);
case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3);
@ -251,7 +290,7 @@ public final class plasmaCrawlLURL extends indexURL {
return null;
}
public boolean removeStack(int stack, int pos) {
public synchronized boolean removeStack(int stack, int pos) {
Object prevElement = null;
switch (stack) {
case 1: prevElement = externResultStack.remove(pos); break;
@ -264,7 +303,7 @@ public final class plasmaCrawlLURL extends indexURL {
return prevElement != null;
}
public void clearStack(int stack) {
public synchronized void clearStack(int stack) {
switch (stack) {
case 1: externResultStack.clear(); break;
case 2: searchResultStack.clear(); break;
@ -275,29 +314,31 @@ public final class plasmaCrawlLURL extends indexURL {
}
}
public boolean remove(String urlHash) {
if (!super.remove(urlHash)) return false;
for (int stack = 1; stack <= 6; stack++) {
for (int i = getStackSize(stack) - 1; i >= 0; i--) {
if (getUrlHash(stack,i).equals(urlHash)) {
removeStack(stack,i);
return true;
public synchronized boolean remove(String urlHash) {
if (urlHash == null) return false;
try {
kelondroRow.Entry r = urlIndexFile.remove(urlHash.getBytes());
if (r == null) return false;
for (int stack = 1; stack <= 6; stack++) {
for (int i = getStackSize(stack) - 1; i >= 0; i--) {
if (getUrlHash(stack, i).equals(urlHash)) {
removeStack(stack, i);
return true;
}
}
}
return true;
} catch (IOException e) {
return false;
}
return false;
}
public boolean exists(String urlHash) {
try {
if (urlIndexFile.get(urlHash.getBytes()) != null) {
return true;
} else {
return false;
}
} catch (IOException e) {
return false;
}
public synchronized boolean exists(String urlHash) {
try {
return (urlIndexFile.get(urlHash.getBytes()) != null);
} catch (IOException e) {
return false;
}
}
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
@ -402,7 +443,10 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow.Entry e = (kelondroRow.Entry) i.next();
if (e == null) return null;
try {
return new plasmaCrawlLURLOldEntry(e, null);
if (newdb)
return new plasmaCrawlLURLNewEntry(e, null);
else
return new plasmaCrawlLURLOldEntry(e, null);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
}
@ -602,7 +646,7 @@ public final class plasmaCrawlLURL extends indexURL {
} catch (MalformedURLException e) {}
if (args[0].equals("-l")) try {
// arg 1 is path to URLCache
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false);
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString());

@ -37,33 +37,20 @@ import de.anomic.index.indexEntry;
public interface plasmaCrawlLURLEntry {
public kelondroRow.Entry toRowEntry() throws IOException;
public String hash();
public Components comp();
public Date moddate();
public Date loaddate();
public Date freshdate();
public String referrerHash();
public char doctype();
public String language();
public int size();
public int wordCount();
public String snippet();
public indexEntry word();
public boolean isOlder(plasmaCrawlLURLEntry other);
public String toString(String snippet);
public String toString();
public class Components {
@ -81,6 +68,13 @@ public interface plasmaCrawlLURLEntry {
this.tags = tags;
this.ETag = ETag;
}
/**
 * Creates a component set from an already constructed URL object and the
 * accompanying text fields. All arguments are stored as given.
 *
 * @param url    the URL of the entry
 * @param descr  the description text
 * @param author the author text
 * @param tags   the tags text
 * @param ETag   the ETag text
 */
public Components(URL url, String descr, String author, String tags, String ETag) {
    // plain field initialisation; the assignments are independent of each other
    this.ETag = ETag;
    this.tags = tags;
    this.author = author;
    this.descr = descr;
    this.url = url;
}
public URL url() { return this.url; }
public String descr() { return this.descr; }
public String author() { return this.author; }

@ -14,7 +14,7 @@ import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverCodings;
import de.anomic.tools.crypt;
import de.anomic.tools.bitfield;
@ -27,8 +27,9 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
"String comp-360, " + // components: the url, description, author and tags. As 5th element, an ETag is possible
"Cardinal mod-4 {b256}, " + // last-modified from the httpd
"Cardinal load-4 {b256}, " + // time when the url was loaded
"Cardinal fresh-4 {b256}, " + // time until this url is fresh
"String referrer-12, " + // (one of) the url's referrer hash(es)
"byte[] md5-8" + // the md5 of the url content (to identify changes)
"byte[] md5-8, " + // the md5 of the url content (to identify changes)
"Cardinal size-6 {b256}, " + // size of file in bytes
"Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds
"byte[] dt-1, " + // doctype, taken from extension or any other heuristic
@ -53,11 +54,12 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
String ETag,
Date mod,
Date load,
Date fresh,
String referrer,
byte[] md5,
long size,
int wc,
byte dt,
char dt,
bitfield flags,
String lang,
int llocal,
@ -72,6 +74,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag));
this.entry.setCol("mod", encodeDate(mod));
this.entry.setCol("load", encodeDate(load));
this.entry.setCol("fresh", encodeDate(fresh));
this.entry.setCol("referrer", referrer.getBytes());
this.entry.setCol("md5", md5);
this.entry.setCol("size", size);
@ -89,17 +92,18 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
this.word = null;
}
byte[] encodeDate(Date d) {
/**
 * Encodes a date at day granularity for storage in a row column.
 *
 * @param d the date to encode; must not be null
 * @return the result of kelondroNaturalOrder.encodeLong for the day count,
 *         called with a length argument of 4
 */
public static byte[] encodeDate(Date d) {
    // drop the time-of-day part: 86400000 milliseconds make one day
    final long dayCount = d.getTime() / 86400000;
    return kelondroNaturalOrder.encodeLong(dayCount, 4);
}
byte[] encodeComp(URL url, String descr, String author, String tags, String ETag) {
serverByteBuffer s = new serverByteBuffer(200);
public static byte[] encodeComp(URL url, String descr, String author, String tags, String ETag) {
serverCharBuffer s = new serverCharBuffer(200);
s.append(url.toNormalform()).append((char) 10);
s.append(descr).append((char) 10);
s.append(author).append((char) 10);
s.append(tags).append((char) 10);
s.append(ETag).append((char) 10);
return s.getBytes();
return s.toString().getBytes();
}
public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
@ -108,7 +112,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
this.word = searchedWord;
}
public plasmaCrawlLURLNewEntry(Properties prop, boolean setGlobal) throws IOException {
public plasmaCrawlLURLNewEntry(Properties prop){
// generates a plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -116,7 +120,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
try {
url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
} catch (MalformedURLException e) {
throw new IOException("URL is not proper: " + crypt.simpleDecode(prop.getProperty("url", ""), null));
url = null;
}
String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = "";
String author = crypt.simpleDecode(prop.getProperty("author", ""), null); if (author == null) author = "";
@ -136,8 +140,13 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
} catch (ParseException e) {
this.entry.setCol("load", encodeDate(new Date()));
}
try {
this.entry.setCol("fresh", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("fresh", "20000101"))));
} catch (ParseException e) {
this.entry.setCol("fresh", encodeDate(new Date()));
}
this.entry.setCol("referrer", prop.getProperty("referrer", indexURL.dummyHash).getBytes());
this.entry.setCol("md5", serverCodings.decodeHex(prop.getProperty("md5", indexURL.dummyHash)));
this.entry.setCol("md5", serverCodings.decodeHex(prop.getProperty("md5", "")));
this.entry.setCol("size", Integer.parseInt(prop.getProperty("size", "0")));
this.entry.setCol("wc", Integer.parseInt(prop.getProperty("wc", "0")));
this.entry.setCol("dt", prop.getProperty("dt", "t").charAt(0));
@ -166,6 +175,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
s.append(",ETag=").append(crypt.simpleEncode(comp.ETag()));
s.append(",mod=").append(indexURL.shortDayFormatter.format(moddate()));
s.append(",load=").append(indexURL.shortDayFormatter.format(loaddate()));
s.append(",fresh=").append(indexURL.shortDayFormatter.format(freshdate()));
s.append(",referrer=").append(referrerHash());
s.append(",md5=").append(md5());
s.append(",size=").append(size());
@ -207,7 +217,7 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
return this.entry.getColString("hash", "", null);
}
public de.anomic.plasma.plasmaCrawlLURLEntry.Components comp() {
public plasmaCrawlLURLEntry.Components comp() {
ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
return new de.anomic.plasma.plasmaCrawlLURLEntry.Components(
(cl.size() > 0) ? (String) cl.get(0) : "",
@ -225,6 +235,10 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
return new Date(86400000 * entry.getColLong("load", 0));
}
/**
 * Reads the "fresh" column of this entry and converts it to a Date.
 *
 * @return a Date built from the "fresh" column value multiplied by the
 *         number of milliseconds per day; a missing value is read as 0
 */
public Date freshdate() {
    // the column value is scaled by 86400000 (ms per day) to get epoch milliseconds
    final Date fresh = new Date(86400000 * entry.getColLong("fresh", 0));
    return fresh;
}
public String referrerHash() {
// return the creator's hash
return entry.getColString("referrer", indexURL.dummyHash, null);

@ -36,7 +36,9 @@ import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
@ -56,7 +58,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
"Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count
private String url;
private URL url;
private String descr;
private Date moddate;
private Date loaddate;
@ -72,24 +74,42 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests
public plasmaCrawlLURLOldEntry(String url, String descr, Date moddate,
Date loaddate, String referrerHash, int copyCount,
boolean localNeed, int quality, String language, char doctype,
int size, int wordCount) {
public plasmaCrawlLURLOldEntry(
URL url,
String descr,
String author,
String tags,
String ETag,
Date mod,
Date load,
Date fresh,
String referrer,
byte[] md5,
long size,
int wc,
char dt,
bitfield flags,
String lang,
int llocal,
int lother,
int laudio,
int limage,
int lvideo,
int lapp) {
// create new entry and store it into database
this.urlHash = indexURL.urlHash(url);
this.url = url;
this.descr = (descr == null) ? this.url.toString() : descr;
this.moddate = moddate;
this.loaddate = loaddate;
this.moddate = mod;
this.loaddate = load;
this.referrerHash = (referrerHash == null) ? indexURL.dummyHash : referrerHash;
this.copyCount = copyCount; // the number of remote (global) copies of this object without this one
this.flags = (localNeed) ? "L " : " ";
this.quality = quality;
this.copyCount = 0; // the number of remote (global) copies of this object without this one
this.flags = " ";
this.quality = 0;
this.language = (language == null) ? "uk" : language;
this.doctype = doctype;
this.size = size;
this.wordCount = wordCount;
this.doctype = dt;
this.size = (int) size;
this.wordCount = wc;
this.snippet = null;
this.word = null;
}
@ -97,7 +117,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
try {
this.urlHash = entry.getColString(0, null);
this.url = entry.getColString(1, "UTF-8").trim();
this.url = new URL(entry.getColString(1, "UTF-8").trim());
this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
this.moddate = new Date(86400000 * entry.getColLong(3));
this.loaddate = new Date(86400000 * entry.getColLong(4));
@ -118,7 +138,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
}
}
public plasmaCrawlLURLOldEntry(Properties prop, boolean setGlobal) {
public plasmaCrawlLURLOldEntry(Properties prop) {
// generates a plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -130,8 +150,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
this.loaddate = indexURL.shortDayFormatter.parse(prop.getProperty("load", "20000101"));
this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " ");
if (setGlobal) this.flags = "G ";
this.url = crypt.simpleDecode(prop.getProperty("url", ""), null);
this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
if (this.descr == null) this.descr = this.url.toString();
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
@ -154,6 +173,10 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
}
}
/**
 * Static accessor for the row definition used by this entry class.
 *
 * @return the class-level rowdef object
 */
public static kelondroRow rowdef() {
    return plasmaCrawlLURLOldEntry.rowdef;
}
public kelondroRow.Entry toRowEntry() throws IOException {
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexURL.urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength);
@ -195,6 +218,10 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
return loaddate;
}
/**
 * Returns the freshness date of this entry.
 * This implementation hands back the load date for it.
 *
 * @return the same Date object that loaddate holds
 */
public Date freshdate() {
    return this.loaddate;
}
public String referrerHash() {
// return the creator's hash
return referrerHash;

@ -239,6 +239,16 @@ public class plasmaCrawlNURL extends indexURL {
}
}
/**
 * Removes the entry stored under the given hash from urlIndexFile.
 *
 * @param hash the hash of the entry to remove; may be null
 * @return true if the remove call completed without an IOException,
 *         false if hash was null or the remove call threw an IOException
 */
public boolean remove(String hash) {
    boolean removed = false;
    if (hash != null) {
        try {
            urlIndexFile.remove(hash.getBytes());
            removed = true;
        } catch (IOException e) {
            // failure is reported through the return value only
        }
    }
    return removed;
}
private static String normalizeHandle(int h) {
String d = Integer.toHexString(h);
while (d.length() < urlHandleLength) d = "0" + d;

@ -199,7 +199,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// storage management
public File htCachePath;
private File plasmaPath;
public File indexPublicTextPath;
public File indexPath;
public File listsPath;
public File htDocsPath;
public File rankingPath;
@ -279,8 +279,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// load values from configs
this.plasmaPath = new File(rootPath, getConfig("dbPath", "DATA/PLASMADB"));
this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
this.indexPublicTextPath = new File(rootPath, getConfig("indexPublicTextPath", "DATA/INDEX/PUBLIC/TEXT"));
this.log.logConfig("Index Path: " + this.indexPublicTextPath.toString());
this.indexPath = new File(rootPath, getConfig("indexPath", "DATA/INDEX"));
this.log.logConfig("Index Path: " + this.indexPath.toString());
this.listsPath = new File(rootPath, getConfig("listsPath", "DATA/LISTS"));
this.log.logConfig("Lists Path: " + this.listsPath.toString());
this.htDocsPath = new File(rootPath, getConfig("htDocsPath", "DATA/HTDOCS"));
@ -418,12 +418,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start indexing management
log.logConfig("Starting Indexing Management");
urlPool = new plasmaURLPool(plasmaPath,
urlPool = new plasmaURLPool(plasmaPath, indexPath,
ramLURL, getConfigBool("useFlexTableForLURL", false),
ramNURL, getConfigBool("useFlexTableForNURL", false),
ramEURL, getConfigBool("useFlexTableForEURL", true),
ramLURL_time);
wordIndex = new plasmaWordIndex(plasmaPath, indexPublicTextPath, ramRWI, ramRWI_time, log, getConfigBool("useCollectionIndex", false));
wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log, getConfigBool("useCollectionIndex", false));
// set a high maximum cache size to current size; this is adopted later automatically
int wordCacheMaxCount = Math.max((int) getConfigLong("wordCacheInitCount", 30000),
@ -1559,20 +1559,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// create a new loaded URL db entry
plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url().toNormalform(), // URL
docDescription, // document description
docDate, // modification date
new Date(), // loaded date
referrerUrlHash, // referer hash
0, // copy count
true, // local need
condenser.RESULT_WORD_ENTROPHY, // quality
indexEntryAttribute.language(entry.url()), // language
indexEntryAttribute.docType(document.getMimeType()), // doctype
(int) entry.size(), // size
condenser.RESULT_NUMB_WORDS // word count
entry.url(), // URL
docDescription, // document description
"", // author
"", // tags
"", // ETag
docDate, // modification date
new Date(), // loaded date
new Date(), // freshdate
referrerUrlHash, // referer hash
new byte[0], // md5
(int) entry.size(), // size
condenser.RESULT_NUMB_WORDS, // word count
indexEntryAttribute.docType(document.getMimeType()), // doctype
new bitfield(4), // flags
indexEntryAttribute.language(entry.url()), // language
0,0,0,0,0,0
);
/* ========================================================================
* STORE URL TO LOADED-URL-DB
* ======================================================================== */
@ -1968,7 +1971,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr, true);
plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr);
urlPool.loadedURL.store(entry);
urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.noticeURL.remove(entry.hash());

@ -57,12 +57,12 @@ public class plasmaURLPool {
public final plasmaCrawlNURL noticeURL;
public final plasmaCrawlEURL errorURL;
public plasmaURLPool(File plasmaPath,
public plasmaURLPool(File plasmaPath, File indexPath,
int ramLURL, boolean newLURL,
int ramNURL, boolean newNURL,
int ramEURL, boolean newEURL,
long preloadTime) {
loadedURL = new plasmaCrawlLURL(plasmaPath, ramLURL, preloadTime, newLURL);
loadedURL = new plasmaCrawlLURL(plasmaPath, indexPath, ramLURL, preloadTime, newLURL);
noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1, newNURL);
errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1, newEURL);
}

@ -72,7 +72,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
public boolean useCollectionIndex; // flag for usage of new collectionIndex db
private int idleDivisor, busyDivisor;
public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, int bufferkb, long preloadTime, serverLog log, boolean useCollectionIndex) {
public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log, boolean useCollectionIndex) {
this.oldDatabaseRoot = oldDatabaseRoot;
this.backend = new plasmaWordIndexFileCluster(oldDatabaseRoot, log);
this.dhtOutCache = new indexRAMCacheRI(oldDatabaseRoot, (useCollectionIndex) ? 1024 : 64, "indexDump1.array", log);
@ -83,9 +83,10 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
this.assortmentBufferSize = bufferkb;
// create collections storage path
if (!(newIndexRoot.exists())) newIndexRoot.mkdirs();
File textindexpath = new File(newIndexRoot, "PUBLIC/TEXT");
if (!(textindexpath.exists())) textindexpath.mkdirs();
if (useCollectionIndex) {
this.collections = new indexCollectionRI(newIndexRoot, "test_generation1", bufferkb * 1024, preloadTime);
this.collections = new indexCollectionRI(textindexpath, "test_generation1", bufferkb * 1024, preloadTime);
if (assortmentClusterPath.exists())
this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, assortmentBufferSize, preloadTime, log);
else
@ -855,8 +856,8 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
// System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
// System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));
File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB");
File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX\\PRIVATE\\TEXT");
plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, 555, 1000, new serverLog("TESTAPP"), false);
File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX");
plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, true, 555, 1000, new serverLog("TESTAPP"), false);
try {
Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true);
while (containerIter.hasNext()) {

@ -501,7 +501,7 @@ public final class yacyClient {
String[] urls = new String[results];
for (int n = 0; n < results; n++) {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
urlEntry = urlManager.newEntry((String) result.get("resource" + n));
if (urlEntry == null) continue;
plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with backlist

@ -75,10 +75,10 @@ import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
@ -98,6 +98,7 @@ import de.anomic.server.serverPlainSwitch;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverSystem;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.enumerateFiles;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
@ -651,11 +652,11 @@ public final class yacy {
final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
File dbroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
serverLog log = new serverLog("WORDMIGRATION");
log.logInfo("STARTING MIGRATION");
boolean useCollectionIndex = sps.getConfigBool("useCollectionIndex", false);
plasmaWordIndex wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, 20000, 10000, log, useCollectionIndex);
plasmaWordIndex wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, true, 20000, 10000, log, useCollectionIndex);
enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true);
String wordhash;
File wordfile;
@ -696,8 +697,8 @@ public final class yacy {
// run with "java -classpath classes yacy -minimizeUrlDB"
final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
File dbroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT");
File plasmaroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
serverLog log = new serverLog("URL-CLEANUP");
try {
log.logInfo("STARTING URL CLEANUP");
@ -705,16 +706,16 @@ public final class yacy {
// db containing all currently loaded urls
int cache = dbcache * 1024; // in KB
log.logFine("URLDB-Caches: "+cache+" bytes");
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(dbroot, cache, 10000, false);
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexRoot, cache, 10000, false);
// db used to hold all needed urls
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(dbroot, "minimized"), cache, 10000, false);
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(plasmaroot, "minimized"), indexRoot, cache, 10000, false);
Runtime rt = Runtime.getRuntime();
int cacheMem = (int)((serverMemory.max-rt.totalMemory())/1024)-(2*cache + 8*1024);
if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up.");
plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, indexRoot, cacheMem, 10000, log, sps.getConfigBool("useCollectionIndex", false));
plasmaWordIndex wordIndex = new plasmaWordIndex(plasmaroot, indexRoot, true, cacheMem, 10000, log, sps.getConfigBool("useCollectionIndex", false));
Iterator indexContainerIterator = wordIndex.wordContainers("------------", plasmaWordIndex.RL_WORDFILES, false);
long urlCounter = 0, wordCounter = 0;
@ -944,7 +945,7 @@ public final class yacy {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000);
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, false, 1000, false, 1000, false, 10000);
HashMap doms = new HashMap();
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
@ -1060,7 +1061,7 @@ public final class yacy {
private static void urllist(String homePath, String source, boolean html, String targetName) {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000);
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, false, 1000, false, 1000, false, 10000);
File file = new File(root, targetName);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
@ -1120,23 +1121,104 @@ public final class yacy {
}
}
/*
private static void migratelurls(String homePath) {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000);
kelondroFlexSplitTable fsp = new kelondroFlexSplitTable(new File(root, "DATA//INDEX/PUBLIC/TEXT"), "urls", 1000, -1, plasmaCrawlLURLOldEntry.rowdef, kelondroNaturalOrder.naturalOrder);
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, false, 1000, false, 1000, false, 10000);
kelondroFlexSplitTable fsp = new kelondroFlexSplitTable(new File(root, "DATA/INDEX/PUBLIC/TEXT"), "urls", 1000, -1, plasmaCrawlLURLNewEntry.rowdef, kelondroBase64Order.enhancedCoder);
long start = System.currentTimeMillis();
long last = start;
int tc = pool.loadedURL.size(), c = 0;
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURLEntry entry;
plasmaCrawlLURLEntry oldentry;
kelondroRow.Entry newentry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) {
fsp.put(entry.toRowEntry(), entry.loaddate());
oldentry = (plasmaCrawlLURLEntry) eiter.next();
if (oldentry != null) {
plasmaCrawlLURLEntry.Components comp = oldentry.comp();
newentry = plasmaCrawlLURLNewEntry.rowdef.newEntry();
newentry.setCol("hash", indexURL.urlHash(comp.url()), null);
newentry.setCol("comp", plasmaCrawlLURLNewEntry.encodeComp(comp.url(), comp.descr(), "", "", ""));
newentry.setCol("mod", plasmaCrawlLURLNewEntry.encodeDate(oldentry.moddate()));
newentry.setCol("load", plasmaCrawlLURLNewEntry.encodeDate(oldentry.loaddate()));
newentry.setCol("referrer", oldentry.referrerHash().getBytes());
newentry.setCol("md5", new byte[0]);
newentry.setCol("size", oldentry.size());
newentry.setCol("wc", oldentry.wordCount());
newentry.setCol("dt", oldentry.doctype());
newentry.setCol("flags", new bitfield(4).getBytes());
newentry.setCol("lang", oldentry.language().getBytes());
newentry.setCol("llocal", 0);
newentry.setCol("lother", 0);
newentry.setCol("limage", 0);
newentry.setCol("laudio", 0);
newentry.setCol("lvideo", 0);
newentry.setCol("lapp", 0);
fsp.put(newentry, oldentry.loaddate());
}
c++;
if (System.currentTimeMillis() - last > 60000) {
System.out.println("Migrated " + c + " from " + tc + " urls. Estimated remaining time: " + ((System.currentTimeMillis() - start) * (tc - c) / c / 60000) + " minutes");
last = System.currentTimeMillis();
}
}
System.out.println("MIGRATION OF " + c + " URLs FINISHED");
pool.close();
} catch (IOException e) {
e.printStackTrace();
}
}
*/
private static void migratelurls(File root, File urlHash) {
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, true, 1000, true, 1000, true, 10000);
kelondroTree oldindex = new kelondroTree(urlHash, 1000, -1, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef);
long start = System.currentTimeMillis();
long last = start;
int tc = oldindex.size(), c = 0;
Iterator eiter = oldindex.contentRows(-1);
kelondroRow.Entry oldrow;
plasmaCrawlLURLEntry oldentry;
plasmaCrawlLURLEntry newentry;
plasmaCrawlLURLEntry.Components comp;
byte[] dummymd5 = new byte[0];
while (eiter.hasNext()) {
oldrow = (kelondroRow.Entry) eiter.next();
if (oldrow != null) {
oldentry = new plasmaCrawlLURLOldEntry(oldrow, null);
comp = oldentry.comp();
newentry = pool.loadedURL.newEntry(
comp.url(),
comp.descr(),
"",
"",
"",
oldentry.moddate(),
oldentry.loaddate(),
oldentry.freshdate(),
oldentry.referrerHash(),
dummymd5,
oldentry.size(),
oldentry.wordCount(),
oldentry.doctype(),
new bitfield(4),
oldentry.language(),
0, 0, 0, 0, 0, 0);
pool.loadedURL.store(newentry);
}
c++;
if (System.currentTimeMillis() - last > 60000) {
System.out.println("Migrated " + c + " from " + tc + " urls. Estimated remaining time: " + ((System.currentTimeMillis() - start) * (tc - c) * Math.sqrt(Math.sqrt(tc - c)) / c / 60000) + " minutes");
last = System.currentTimeMillis();
}
}
pool.close();
oldindex.close();
System.out.println("MIGRATION OF " + c + " URLs FINISHED");
} catch (IOException e) {
e.printStackTrace();
}
@ -1157,11 +1239,12 @@ public final class yacy {
*/
private static void urldbcleanup(String homePath) {
File root = new File(homePath);
File dbroot = new File(root, "DATA/PLASMADB");
File plasmaroot = new File(root, "DATA/PLASMADB");
File indexroot = new File(root, "DATA/INDEX");
serverLog log = new serverLog("URLDBCLEANUP");
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
try {
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(dbroot, 4194304, 10000, false);
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexroot, 4194304, 10000, false);
currentUrlDB.urldbcleanup();
currentUrlDB.close();
} catch (IOException e) {
@ -1174,7 +1257,7 @@ public final class yacy {
serverLog log = new serverLog("HASHLIST");
final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
File homeDBroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
String wordChunkStartHash = "------------";
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
log.logInfo("STARTING CREATION OF RWI-HASHLIST");
@ -1182,7 +1265,7 @@ public final class yacy {
try {
Iterator indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false));
WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false));
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
} else if (resource.equals("assortments")) {
plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, 16*1024*1024, 3000, log);
@ -1394,7 +1477,8 @@ public final class yacy {
String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) {
migratelurls(applicationRoot);
File root = new File(applicationRoot);
migratelurls(root, new File(root, "DATA/PLASMADB/urlHash.db"));
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];

@ -201,7 +201,7 @@ promoteSearchPageGreeting =
dbPath=DATA/PLASMADB
# the path to the public reverse word index for text files (web pages)
indexPublicTextPath=DATA/INDEX/PUBLIC/TEXT
indexPath=DATA/INDEX
# the path to the LISTS files. Most lists are used to filter web content
listsPath=DATA/LISTS
@ -819,6 +819,7 @@ currentSkin=
useCollectionIndex=false
useFlexTableForNURL=false
useFlexTableForEURL=true
useFlexTableForLURL=false
tableTypeForPreNURL=2
# flag to show surftipps on index.html page

Loading…
Cancel
Save