a large collection of different changes

* mainly for the transition to the new indexing database structure
* a bugfix for an endless loop inside kelondroTree iteration
* a bugfix for bulk reads inside a kelondroTree iteration; the bug caused some elements to be iterated twice
* very strong speed enhancement for url/domain extraction

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2320 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 493b1cd2bf
commit 58df8b7bbf

@ -153,7 +153,7 @@ public class IndexControl_p {
int i = 0;
urlx = new String[index.size()];
while (en.hasNext()) {
urlx[i++] = ((indexURLEntry) en.next()).getUrlHash();
urlx[i++] = ((indexURLEntry) en.next()).urlHash();
}
index = null;
}
@ -259,15 +259,15 @@ public class IndexControl_p {
while (urlIter.hasNext()) {
indexEntry = (indexURLEntry) urlIter.next();
try {
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null);
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.urlHash(), null);
if (lurl.toString() == null) {
unknownURLEntries.add(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.urlHash());
urlIter.remove();
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
knownURLs.put(indexEntry.urlHash(), lurl);
}
} catch (IOException e) {
unknownURLEntries.add(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.urlHash());
}
}
// use whats remaining
@ -441,7 +441,7 @@ public class IndexControl_p {
indexURLEntry xi;
while (en.hasNext()) {
xi = (indexURLEntry) en.next();
uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())};
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
try {
us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString();
tm.put(us, uh);

@ -149,7 +149,7 @@ public final class transferRWI {
sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true);
serverCore.checkInterruption();
urlHash = entry.getUrlHash();
urlHash = entry.urlHash();
try {
if ((!(unknownURL.contains(urlHash))) &&
(!(sb.urlPool.loadedURL.exists(urlHash)))) {

@ -107,10 +107,10 @@ public abstract class indexAbstractEntry implements indexEntry {
return e;
}
public String getUrlHash() { return urlHash; }
public int getQuality() { return quality; }
public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); }
public long getLastModified() { return lastModified; }
public String urlHash() { return urlHash; }
public int quality() { return quality; }
public int virtualAge() { return plasmaWordIndex.microDateDays(lastModified); }
public long lastModified() { return lastModified; }
public int hitcount() { return hitcount; }
public int posintext() { return posintext; }
public int posinphrase() { return posinphrase; }
@ -125,7 +125,7 @@ public abstract class indexAbstractEntry implements indexEntry {
public boolean isNewer(indexEntry other) {
if (other == null) return true;
if (this.lastModified > ((indexAbstractEntry) other).lastModified) return true;
if (this.lastModified == ((indexAbstractEntry) other).getLastModified()) {
if (this.lastModified == ((indexAbstractEntry) other).lastModified()) {
if (this.quality > ((indexAbstractEntry) other).quality) return true;
}
return false;
@ -133,8 +133,8 @@ public abstract class indexAbstractEntry implements indexEntry {
public boolean isOlder(indexEntry other) {
if (other == null) return false;
if (this.lastModified < ((indexAbstractEntry) other).getLastModified()) return true;
if (this.lastModified == ((indexAbstractEntry) other).getLastModified()) {
if (this.lastModified < ((indexAbstractEntry) other).lastModified()) return true;
if (this.lastModified == ((indexAbstractEntry) other).lastModified()) {
if (this.quality < ((indexAbstractEntry) other).quality) return true;
}
return false;

@ -36,7 +36,20 @@ public interface indexEntry {
public String toPropertyForm();
public kelondroRow.Entry toKelondroEntry();
public String getUrlHash();
public String urlHash();
public int quality();
public int virtualAge();
public long lastModified();
public int hitcount();
public int posintext();
public int posinphrase();
public int posofphrase();
public int wordcount();
public int phrasecount();
public String getLanguage();
public char getType();
public boolean isLocal();
public void combineDistance(indexEntry oe);
public int worddistance();
public void min(indexEntry other);

@ -119,7 +119,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
row.setCol(0, container.getWordHash().getBytes());
row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4));
row.setCol(2, kelondroNaturalOrder.encodeLong(container.updated(), 8));
row.setCol(3, wordEntry.getUrlHash().getBytes());
row.setCol(3, wordEntry.urlHash().getBytes());
row.setCol(4, wordEntry.toEncodedStringForm().getBytes());
dumpArray.set((int) urlcount++, row);
}
@ -148,7 +148,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
row.setCol(0, wordHash.getBytes());
row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4));
row.setCol(2, kelondroNaturalOrder.encodeLong(updateTime, 8));
row.setCol(3, wordEntry.getUrlHash().getBytes());
row.setCol(3, wordEntry.urlHash().getBytes());
row.setCol(4, wordEntry.toEncodedStringForm().getBytes());
dumpArray.set((int) urlcount++, row);
}

@ -134,9 +134,9 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen
private boolean addi(indexEntry entry) {
// returns true if the new entry was added, false if it already existed
indexURLEntry oldEntry = (indexURLEntry) container.put(entry.getUrlHash(), entry);
indexURLEntry oldEntry = (indexURLEntry) container.put(entry.urlHash(), entry);
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container
container.put(entry.getUrlHash(), oldEntry); // put it back
container.put(entry.urlHash(), oldEntry); // put it back
return false;
}
return (oldEntry == null);
@ -259,7 +259,7 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen
long stamp = System.currentTimeMillis();
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie0 = (indexEntry) se.next();
ie1 = large.get(ie0.getUrlHash());
ie1 = large.get(ie0.urlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
ie0.combineDistance(ie1);
@ -285,7 +285,7 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = i1.getOrdering().compare(ie1.getUrlHash(), ie2.getUrlHash());
c = i1.getOrdering().compare(ie1.urlHash(), ie2.urlHash());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) {
if (e1.hasNext()) ie1 = (indexURLEntry) e1.next(); else break;

@ -37,21 +37,13 @@ import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexAbstractEntry;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaWordIndex;
public final class indexURLEntry extends indexAbstractEntry implements Cloneable, indexEntry {
public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
new kelondroColumn(
"nickname",
kelondroColumn.celltype_undefined, 4 /*cellwidth*/,
kelondroColumn.encoder_none, 0,
"description")
});
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public indexURLEntry(String urlHash,
@ -180,7 +172,7 @@ public final class indexURLEntry extends indexAbstractEntry implements Cloneable
}
public Entry toKelondroEntry() {
kelondroRow.Entry entry = urlEntryRow.newEntry(toEncodedByteArrayForm());
kelondroRow.Entry entry = indexURLEntryNew.urlEntryRow.newEntry(toEncodedByteArrayForm());
return entry;
}

@ -38,18 +38,18 @@ public class kelondroColumn {
public static final int encoder_none = 0;
public static final int encoder_b64e = 1;
public static final int encoder_string = 2;
public static final int encoder_bytes = 3;
public static final int encoder_char = 4;
public static final int encoder_b256 = 2;
public static final int encoder_string = 3;
public static final int encoder_bytes = 4;
public static final int encoder_char = 5;
private int celltype, cellwidth, encoder, encodedwidth;
private int celltype, cellwidth, encoder;
private String nickname, description;
public kelondroColumn(String nickname, int celltype, int cellwidth, int encoder, int encodedwidth, String description) {
public kelondroColumn(String nickname, int celltype, int encoder, int cellwidth, String description) {
this.celltype = celltype;
this.cellwidth = cellwidth;
this.encoder = encoder;
this.encodedwidth = encodedwidth;
this.nickname = nickname;
this.description = description;
}
@ -66,10 +66,6 @@ public class kelondroColumn {
return this.encoder;
}
public int encodedwidth() {
return this.encodedwidth;
}
public String nickname() {
return this.nickname;
}

@ -33,18 +33,15 @@ public class kelondroRow {
private kelondroColumn[] row;
protected int[] colstart;
private int encodedFormLength;
private int objectsize;
public kelondroRow(kelondroColumn[] row) {
this.row = row;
this.colstart = new int[row.length];
this.objectsize = 0;
this.encodedFormLength = 0;
for (int i = 0; i < row.length; i++) {
this.colstart[i] = this.objectsize;
this.objectsize += this.row[i].cellwidth();
this.encodedFormLength += this.row[i].encodedwidth();
}
}
@ -53,12 +50,10 @@ public class kelondroRow {
this.row = new kelondroColumn[rowi.length];
this.colstart = new int[rowi.length];
this.objectsize = 0;
this.encodedFormLength = 0;
for (int i = 0; i < rowi.length; i++) {
this.row[i] = new kelondroColumn("col_" + i, kelondroColumn.celltype_undefined, rowi[i], kelondroColumn.encoder_none, rowi[i], "");
this.row[i] = new kelondroColumn("col_" + i, kelondroColumn.celltype_undefined, kelondroColumn.encoder_none, rowi[i], "");
this.colstart[i] = this.objectsize;
this.objectsize += this.row[i].cellwidth();
this.encodedFormLength += this.row[i].encodedwidth();
}
}
@ -158,11 +153,49 @@ public class kelondroRow {
}
}
public void setColByte(int column, byte c) {
    // stores a single byte at the start position of the given column
    final int position = colstart[column];
    rowinstance[position] = c;
}
public void setColString(int column, String cell, String encoding) {
    // writes the bytes of the given string into the column;
    // without an encoding name the platform default charset is used
    if (encoding == null) {
        setCol(column, cell.getBytes());
        return;
    }
    try {
        setCol(column, cell.getBytes(encoding));
    } catch (UnsupportedEncodingException e) {
        // NOTE: an unknown encoding name is only reported on stderr;
        // setCol is not called in that case
        e.printStackTrace();
    }
}
public void setColLong(int column, long cell) {
    // uses the column definition to choose the right encoding;
    // throws a kelondroException if the column's encoder cannot represent a long
    switch (row[column].encoder()) {
        case kelondroColumn.encoder_none:
            throw new kelondroException("ROW", "setColLong has celltype none, no encoder given");
        case kelondroColumn.encoder_b64e:
            setColLongB64E(column, cell);
            break;
        case kelondroColumn.encoder_b256:
            setColLongB256(column, cell);
            break;
        case kelondroColumn.encoder_string:
            setCol(column, Long.toString(cell).getBytes());
            break;
        case kelondroColumn.encoder_bytes:
            throw new kelondroException("ROW", "setColLong of celltype bytes not applicable");
        case kelondroColumn.encoder_char:
            throw new kelondroException("ROW", "setColLong of celltype char not applicable");
        default:
            // mirror getColLong: an unknown encoder must not silently drop the value
            throw new kelondroException("ROW", "setColLong did not find appropriate encoding");
    }
}
public void setColLongB256(int column, long cell) {
    // temporary method, should be replaced by setColLong if all row declarations are complete
    // encodes the value with kelondroNaturalOrder into the column's slice of the row bytes
    final int offset = colstart[column];
    final int width = row[column].cellwidth();
    kelondroNaturalOrder.encodeLong(cell, rowinstance, offset, width);
}
public void setColLongB64E(int column, long cell) {
    // temporary method, should be replaced by setColLong if all row declarations are complete
    // encodes the value with the enhanced base64 coder into the column's slice of the row bytes
    final int offset = colstart[column];
    final int width = row[column].cellwidth();
    kelondroBase64Order.enhancedCoder.encodeLong(cell, rowinstance, offset, width);
}
@ -183,11 +216,32 @@ public class kelondroRow {
}
}
public long getColLong(int column) {
    // reads the column as a long; the decoder is selected by the encoder
    // declared in the column definition
    final int enc = row[column].encoder();

    // encoders that can hold a long
    if (enc == kelondroColumn.encoder_b64e) {
        return getColLongB64E(column);
    }
    if (enc == kelondroColumn.encoder_b256) {
        return getColLongB256(column);
    }
    if (enc == kelondroColumn.encoder_string) {
        return Long.parseLong(getColString(column, null));
    }

    // encoders that cannot hold a long
    if (enc == kelondroColumn.encoder_none) {
        throw new kelondroException("ROW", "getColLong has celltype none, no encoder given");
    }
    if (enc == kelondroColumn.encoder_bytes) {
        throw new kelondroException("ROW", "getColLong of celltype bytes not applicable");
    }
    if (enc == kelondroColumn.encoder_char) {
        throw new kelondroException("ROW", "getColLong of celltype char not applicable");
    }
    throw new kelondroException("ROW", "getColLong did not find appropriate encoding");
}
public long getColLongB256(int column) {
    // temporary method, should be replaced by getColLong if all row declarations are complete
    // decodes the column's slice of the row bytes with kelondroNaturalOrder
    final int offset = colstart[column];
    final int width = row[column].cellwidth();
    return kelondroNaturalOrder.decodeLong(rowinstance, offset, width);
}
public long getColLongB64E(int column) {
    // temporary method, should be replaced by getColLong if all row declarations are complete
    // decodes the column's slice of the row bytes with the enhanced base64 coder
    final int offset = colstart[column];
    final int width = row[column].cellwidth();
    return kelondroBase64Order.enhancedCoder.decodeLong(rowinstance, offset, width);
}
@ -202,30 +256,30 @@ public class kelondroRow {
}
public byte[] toEncodedBytesForm() {
byte[] b = new byte[encodedFormLength];
int encoder, encodedwidth;
byte[] b = new byte[objectsize];
int encoder, cellwidth;
int p = 0;
for (int i = 0; i < row.length; i++) {
encoder = row[i].encoder();
encodedwidth = row[i].encodedwidth();
cellwidth = row[i].cellwidth();
switch (row[i].celltype()) {
case kelondroColumn.celltype_undefined:
throw new kelondroException("ROW", "toEncodedForm of celltype undefined not possible");
case kelondroColumn.celltype_boolean:
throw new kelondroException("ROW", "toEncodedForm of celltype boolean not yet implemented");
case kelondroColumn.celltype_binary:
System.arraycopy(rowinstance, colstart[i], b, p, encodedwidth);
p += encodedwidth;
System.arraycopy(rowinstance, colstart[i], b, p, cellwidth);
p += cellwidth;
continue;
case kelondroColumn.celltype_string:
System.arraycopy(rowinstance, colstart[i], b, p, encodedwidth);
p += encodedwidth;
System.arraycopy(rowinstance, colstart[i], b, p, cellwidth);
p += cellwidth;
continue;
case kelondroColumn.celltype_cardinal:
if (encoder == kelondroColumn.encoder_b64e) {
long c = bytes2long(rowinstance, colstart[i]);
System.arraycopy(kelondroBase64Order.enhancedCoder.encodeLongSmart(c, encodedwidth).getBytes(), 0, b, p, encodedwidth);
p += encodedwidth;
long c = bytes2long(rowinstance, colstart[i], cellwidth);
System.arraycopy(kelondroBase64Order.enhancedCoder.encodeLongSmart(c, cellwidth).getBytes(), 0, b, p, cellwidth);
p += cellwidth;
continue;
}
throw new kelondroException("ROW", "toEncodedForm of celltype cardinal has no encoder (" + encoder + ")");
@ -236,6 +290,49 @@ public class kelondroRow {
return b;
}
public String toPropertyForm() {
    // renders this row as "{nickname=value,nickname=value,...}", one pair per column
    // in declaration order. binary and string cells are copied byte-wise as chars;
    // cardinal cells are only supported with the b64e encoder.
    // throws a kelondroException for undefined, boolean and real columns and for
    // cardinal columns with any other encoder.
    StringBuffer sb = new StringBuffer();
    sb.append("{");
    int encoder, cellwidth;
    for (int i = 0; i < row.length; i++) {
        encoder = row[i].encoder();
        cellwidth = row[i].cellwidth();
        switch (row[i].celltype()) {
            case kelondroColumn.celltype_undefined:
                throw new kelondroException("ROW", "toPropertyForm of celltype undefined not possible");
            case kelondroColumn.celltype_boolean:
                throw new kelondroException("ROW", "toPropertyForm of celltype boolean not yet implemented");
            case kelondroColumn.celltype_binary:
            case kelondroColumn.celltype_string:
                // both cell types are written the same way: raw bytes as chars
                sb.append(row[i].nickname());
                sb.append('=');
                for (int j = colstart[i]; j < colstart[i] + cellwidth; j++) sb.append((char) rowinstance[j]);
                sb.append(',');
                continue;
            case kelondroColumn.celltype_cardinal:
                if (encoder == kelondroColumn.encoder_b64e) {
                    sb.append(row[i].nickname());
                    sb.append('=');
                    long c = bytes2long(rowinstance, colstart[i], cellwidth);
                    // append the encoded String itself; appending its byte[] would go through
                    // append(Object) and print the array identity ("[B@...") instead of the value
                    sb.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(c, cellwidth));
                    sb.append(',');
                    continue;
                }
                throw new kelondroException("ROW", "toPropertyForm of celltype cardinal has no encoder (" + encoder + ")");
            case kelondroColumn.celltype_real:
                throw new kelondroException("ROW", "toPropertyForm of celltype real not yet implemented");
        }
    }
    if (sb.charAt(sb.length() - 1) == ',') sb.deleteCharAt(sb.length() - 1); // remove ',' at end
    sb.append("}");
    return sb.toString();
}
public String toString() {
StringBuffer b = new StringBuffer();
b.append('{');
@ -255,10 +352,10 @@ public class kelondroRow {
}
}
public final static long bytes2long(byte[] b, int offset) {
public final static long bytes2long(byte[] b, int offset, int length) {
if (b == null) return 0;
long x = 0;
for (int i = 0; i < b.length; i++) x = (x << 8) | (0xff & b[offset + i]);
for (int i = 0; i < length; i++) x = (x << 8) | (0xff & b[offset + i]);
return x;
}

@ -1000,11 +1000,14 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
setOrder.rotate(firstKey);
TreeMap rows = new TreeMap(setOrder);
Node n;
String key;
synchronized (this) {
Iterator i = (firstKey == null) ? new nodeIterator(up, rotating) : new nodeIterator(up, rotating, firstKey, including);
while ((rows.size() < count) && (i.hasNext())) {
n = (Node) i.next();
if (n != null) rows.put(new String(n.getKey()), row().newEntry(n.getValueRow()));
if (n == null) return rows;
key = new String(n.getKey());
if (rows.put(key, row().newEntry(n.getValueRow())) != null) return rows; // protection against loops
}
}
return rows;
@ -1080,6 +1083,7 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
if (!(bufferIterator.hasNext())) {
// assign next buffer chunk
try {
lastKey[lastKey.length - 1]++;
rowBuffer = rowMap(inc, rot, lastKey, false, chunkSize);
bufferIterator = rowBuffer.entrySet().iterator();
} catch (IOException e) {

@ -135,7 +135,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// getting next word index entry
importWordIdxEntry = (indexURLEntry) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.getUrlHash();
String urlHash = importWordIdxEntry.urlHash();
entityUrls.add(urlHash);
}

@ -462,25 +462,33 @@ public final class plasmaCrawlLURL extends indexURL {
this.urlHash = urlHash;
kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
insertEntry(entry, searchedWord);
}
public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
    // creates an entry from a database row that the caller has already loaded
    assert (entry != null);
    // hand over the parameter, not the field 'word': the field is only assigned
    // inside insertEntry, so passing it here would drop the caller's searchedWord
    insertEntry(entry, searchedWord);
}
private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
try {
if (entry != null) {
this.url = new URL(entry.getColString(1, "UTF-8").trim());
this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
this.moddate = new Date(86400000 * entry.getColLongB64E(3));
this.loaddate = new Date(86400000 * entry.getColLongB64E(4));
this.referrerHash = (entry.empty(5)) ? dummyHash : entry.getColString(5, "UTF-8");
this.copyCount = (int) entry.getColLongB64E(6);
this.flags = entry.getColString(7, "UTF-8");
this.quality = (int) entry.getColLongB64E(8);
this.language = entry.getColString(9, "UTF-8");
this.doctype = (char) entry.getColByte(10);
this.size = (int) entry.getColLongB64E(11);
this.wordCount = (int) entry.getColLongB64E(12);
this.snippet = null;
this.word = searchedWord;
this.stored = false;
return;
}
this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8").trim());
this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
this.moddate = new Date(86400000 * entry.getColLongB64E(3));
this.loaddate = new Date(86400000 * entry.getColLongB64E(4));
this.referrerHash = (entry.empty(5)) ? dummyHash : entry.getColString(5, "UTF-8");
this.copyCount = (int) entry.getColLongB64E(6);
this.flags = entry.getColString(7, "UTF-8");
this.quality = (int) entry.getColLongB64E(8);
this.language = entry.getColString(9, "UTF-8");
this.doctype = (char) entry.getColByte(10);
this.size = (int) entry.getColLongB64E(11);
this.wordCount = (int) entry.getColLongB64E(12);
this.snippet = null;
this.word = searchedWord;
this.stored = false;
return;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
throw new IOException("plasmaLURL.entry/1: " + e.toString());
@ -764,8 +772,8 @@ public final class plasmaCrawlLURL extends indexURL {
Iterator i;
boolean error = false;
public kiter(boolean up, boolean rotating) throws IOException {
i = urlHashCache.rows(up, rotating, null);
public kiter(boolean up, boolean rotating, String firstHash) throws IOException {
i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
error = false;
}
@ -777,12 +785,10 @@ public final class plasmaCrawlLURL extends indexURL {
public Object next() throws RuntimeException {
kelondroRow.Entry e = (kelondroRow.Entry) i.next();
if (e == null) return null;
String hash = null;
try {
hash = new String(e.getColBytes(0));
return new Entry(hash, null);
return new Entry(e, null);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + hash);
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
}
}
@ -792,9 +798,9 @@ public final class plasmaCrawlLURL extends indexURL {
}
public Iterator entries(boolean up, boolean rotating) throws IOException {
public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, rotating);
return new kiter(up, rotating, firstHash);
}
/**
@ -807,7 +813,7 @@ public final class plasmaCrawlLURL extends indexURL {
serverLog log = new serverLog("URLDBCLEANUP");
HashSet damagedURLS = new HashSet();
try {
Iterator eiter = entries(true, false);
Iterator eiter = entries(true, false, null);
int iteratorCount = 0;
while (eiter.hasNext()) try {
eiter.next();
@ -893,7 +899,7 @@ public final class plasmaCrawlLURL extends indexURL {
public void run() {
try {
serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
Iterator eiter = entries(true,false);
Iterator eiter = entries(true, false, null);
while (eiter.hasNext() && run) {
synchronized(this) {
if (this.pause) {
@ -975,7 +981,7 @@ public final class plasmaCrawlLURL extends indexURL {
if (args[0].equals("-l")) try {
// arg 1 is path to URLCache
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0);
final Iterator enu = urls.entries(true, false);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
((Entry) enu.next()).print();
}

@ -209,19 +209,19 @@ public class plasmaDHTChunk {
while ((urlIter.hasNext()) && (maxcount > refcount)) {
indexEntry = (indexURLEntry) urlIter.next();
try {
lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry);
lurl = lurls.getEntry(indexEntry.urlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntries(nexthash, new String[] { indexEntry.getUrlHash() }, true);
wordIndex.removeEntries(nexthash, new String[] { indexEntry.urlHash() }, true);
} else {
urlCache.put(indexEntry.getUrlHash(), lurl);
urlCache.put(indexEntry.urlHash(), lurl);
refcount++;
}
} catch (IOException e) {
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntries(nexthash, new String[] { indexEntry.getUrlHash() }, true);
wordIndex.removeEntries(nexthash, new String[] { indexEntry.urlHash() }, true);
}
}
@ -279,7 +279,7 @@ public class plasmaDHTChunk {
urlIter = this.indexContainers[i].entries();
while (urlIter.hasNext()) {
indexEntry = (indexURLEntry) urlIter.next();
urlHashes[c++] = indexEntry.getUrlHash();
urlHashes[c++] = indexEntry.urlHash();
}
count += wordIndex.removeEntries(this.indexContainers[i].getWordHash(), urlHashes, true);
log.logFine("Deleted partial index (" + c + " URLs) for word " + this.indexContainers[i].getWordHash() + "; " + this.wordIndex.indexSize(indexContainers[i].getWordHash()) + " entries left");

@ -251,7 +251,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
entry = preorder.next();
// find the url entry
try {
page = urlStore.getEntry(entry.getUrlHash(), entry);
page = urlStore.getEntry(entry.urlHash(), entry);
// add a result
acc.addResult(entry, page);
} catch (IOException e) {

@ -142,7 +142,7 @@ public final class plasmaSearchPreOrder {
i = container.entries();
for (int j = 0; j < count; j++) {
indexEntry = (indexURLEntry) i.next();
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry);
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.urlHash(), indexEntry);
}
}

@ -168,9 +168,9 @@ public class plasmaSearchRankingProfile {
long ranking = 0;
if (entry instanceof indexURLEntry) {
indexURLEntry normalizedEntry = (indexURLEntry) entry;
ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue();
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue();
ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();

@ -161,7 +161,7 @@ public final class plasmaSearchResult {
// insert value
//System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), page);
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.urlHash(), page);
}
// flush memory

@ -594,12 +594,12 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
try {
url = lurl.getEntry(entry.getUrlHash(), null).url();
url = lurl.getEntry(entry.urlHash(), null).url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(url) == true)) {
urlHashs.add(entry.getUrlHash());
urlHashs.add(entry.urlHash());
}
} catch (IOException e) {
urlHashs.add(entry.getUrlHash());
urlHashs.add(entry.urlHash());
}
}
if (urlHashs.size() > 0) {

@ -150,7 +150,7 @@ public final class plasmaWordIndexAssortment {
indexURLEntry entry;
for (int i = 0; i < assortmentLength; i++) {
entry = (indexURLEntry) entries.next();
row.setCol(3 + 2 * i, entry.getUrlHash().getBytes());
row.setCol(3 + 2 * i, entry.urlHash().getBytes());
row.setCol(4 + 2 * i, entry.toEncodedStringForm().getBytes());
}
kelondroRow.Entry oldrow = null;

@ -148,16 +148,16 @@ public final class plasmaWordIndexFile {
}
public boolean contains(indexURLEntry entry) throws IOException {
return (theIndex.get(entry.getUrlHash().getBytes()) != null);
return (theIndex.get(entry.urlHash().getBytes()) != null);
}
public boolean addEntry(indexURLEntry entry) throws IOException {
if (entry == null) return false;
indexURLEntry oldEntry = getEntry(entry.getUrlHash());
indexURLEntry oldEntry = getEntry(entry.urlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
return false;
}
return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedStringForm().getBytes()) == null);
return (theIndex.put(entry.urlHash().getBytes(), entry.toEncodedStringForm().getBytes()) == null);
}
public int addEntries(indexContainer container) throws IOException {

@ -63,6 +63,9 @@ import java.util.Properties;
import java.util.Hashtable;
import java.util.Iterator;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
public final class serverFileUtils {
/**
@ -326,6 +329,25 @@ public final class serverFileUtils {
tf.renameTo(file);
}
/**
 * Writes column 0 of every row of a row set to a file, with an optional separator
 * between consecutive keys. The data is written to a temporary file next to the
 * target first; afterwards the target is deleted and the temporary file is renamed
 * to the target name.
 * @param file the target file
 * @param set the row set whose rows are written in the order given by <code>set.rows()</code>
 * @param sep separator written between two keys, or <code>null</code> for no separator
 * @throws IOException if the temporary file cannot be created or written
 */
public static void saveSet(File file, kelondroRowSet set, String sep) throws IOException {
    File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(tf));
    try {
        // encode the separator once instead of once per row
        byte[] separator = (sep == null) ? null : sep.getBytes();
        Iterator i = set.rows();
        boolean first = true;
        byte[] key;
        while (i.hasNext()) {
            // write the key bytes as they are; a round trip through String would
            // depend on the platform charset
            key = ((kelondroRow.Entry) i.next()).getColBytes(0);
            if ((!first) && (separator != null)) bos.write(separator);
            bos.write(key);
            first = false;
        }
    } finally {
        // release the file handle even if iterating or writing failed
        bos.close();
    }
    file.delete();
    tf.renameTo(file);
}
/**
* Moves all files from a directory to another.
* @param from_dir Directory which contents will be moved.

@ -893,8 +893,8 @@ public final class yacyClient {
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (indexURLEntry) eenum.next();
if (urlCache.get(entry.getUrlHash()) == null) {
yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache");
if (urlCache.get(entry.urlHash()) == null) {
yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache");
}
}
}

@ -85,11 +85,11 @@ public class yacyNewsDB {
}
public static final kelondroRow rowdef = new kelondroRow(new kelondroColumn[]{
new kelondroColumn("newsid", kelondroColumn.celltype_string, yacyNewsRecord.idLength(), kelondroColumn.encoder_string, yacyNewsRecord.idLength(), "id = created + originator"),
new kelondroColumn("category", kelondroColumn.celltype_string, yacyNewsRecord.categoryStringLength, kelondroColumn.encoder_string, yacyNewsRecord.categoryStringLength, ""),
new kelondroColumn("received", kelondroColumn.celltype_string, yacyCore.universalDateShortPattern.length(), kelondroColumn.encoder_string, yacyCore.universalDateShortPattern.length(), ""),
new kelondroColumn("", kelondroColumn.celltype_string, 2, kelondroColumn.encoder_string, 2, ""),
new kelondroColumn("", kelondroColumn.celltype_string, attributesMaxLength, kelondroColumn.encoder_string, attributesMaxLength, ""),
new kelondroColumn("newsid", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyNewsRecord.idLength(), "id = created + originator"),
new kelondroColumn("category", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyNewsRecord.categoryStringLength, ""),
new kelondroColumn("received", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyCore.universalDateShortPattern.length(), ""),
new kelondroColumn("", kelondroColumn.celltype_string, kelondroColumn.encoder_string, 2, ""),
new kelondroColumn("", kelondroColumn.celltype_string, kelondroColumn.encoder_string, attributesMaxLength, ""),
});
private static kelondroTree createDB(File path, int bufferkb, long preloadTime) {

@ -77,8 +77,8 @@ public class yacyNewsQueue {
}
public static final kelondroRow rowdef = new kelondroRow(new kelondroColumn[]{
new kelondroColumn("newsid", kelondroColumn.celltype_string, yacyNewsRecord.idLength(), kelondroColumn.encoder_string, yacyNewsRecord.idLength(), "id = created + originator"),
new kelondroColumn("last touched", kelondroColumn.celltype_string, yacyCore.universalDateShortPattern.length(), kelondroColumn.encoder_string, yacyCore.universalDateShortPattern.length(), "")
new kelondroColumn("newsid", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyNewsRecord.idLength(), "id = created + originator"),
new kelondroColumn("last touched", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyCore.universalDateShortPattern.length(), "")
});
private static kelondroStack createStack(File path) {

@ -723,7 +723,7 @@ public final class yacy {
indexURLEntry wordIdxEntry;
while (wordIdxEntries.hasNext()) {
wordIdxEntry = (indexURLEntry) wordIdxEntries.next();
String urlHash = wordIdxEntry.getUrlHash();
String urlHash = wordIdxEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);
urlCounter++;
@ -937,7 +937,7 @@ public final class yacy {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000, 10000);
Iterator eiter = pool.loadedURL.entries(true, false);
Iterator eiter = pool.loadedURL.entries(true, false, null);
HashSet doms = new HashSet();
plasmaCrawlLURL.Entry entry;
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
@ -1018,7 +1018,9 @@ public final class yacy {
}
else {
// plain text list
serverFileUtils.saveSet(new File(root, targetName + ".txt"), doms, new String(serverCore.crlf));
File file = new File(root, targetName + ".txt");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, doms, new String(serverCore.crlf));
}
pool.close();
} catch (IOException e) {
@ -1030,7 +1032,7 @@ public final class yacy {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000, 10000);
Iterator eiter = pool.loadedURL.entries(true, false);
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURL.Entry entry;
File file = new File(root, targetName);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));

Loading…
Cancel
Save