From 58df8b7bbfe404c0d6763731354e0d9c6ee641f7 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 23 Jul 2006 22:39:41 +0000 Subject: [PATCH] a large collection of different changes * mainly for the transition to the new indexing database structure * a bugfix for an endless loop inside kelondroTree iteration * a bugfix for bulk read inside a kelondroTree iteration; the bug caused that some elements had been iterated twice * very strong speed enhancement for url/domain extraction git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2320 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 12 +- htroot/yacy/transferRWI.java | 2 +- .../de/anomic/index/indexAbstractEntry.java | 14 +- source/de/anomic/index/indexEntry.java | 15 +- source/de/anomic/index/indexRAMCacheRI.java | 4 +- .../anomic/index/indexTreeMapContainer.java | 8 +- source/de/anomic/index/indexURLEntry.java | 12 +- source/de/anomic/kelondro/kelondroColumn.java | 16 +-- source/de/anomic/kelondro/kelondroRow.java | 133 +++++++++++++++--- source/de/anomic/kelondro/kelondroTree.java | 6 +- .../plasma/dbImport/plasmaDbImporter.java | 2 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 64 +++++---- source/de/anomic/plasma/plasmaDHTChunk.java | 10 +- .../de/anomic/plasma/plasmaSearchEvent.java | 2 +- .../anomic/plasma/plasmaSearchPreOrder.java | 2 +- .../plasma/plasmaSearchRankingProfile.java | 6 +- .../de/anomic/plasma/plasmaSearchResult.java | 2 +- source/de/anomic/plasma/plasmaWordIndex.java | 6 +- .../plasma/plasmaWordIndexAssortment.java | 2 +- .../de/anomic/plasma/plasmaWordIndexFile.java | 6 +- source/de/anomic/server/serverFileUtils.java | 22 +++ source/de/anomic/yacy/yacyClient.java | 4 +- source/de/anomic/yacy/yacyNewsDB.java | 10 +- source/de/anomic/yacy/yacyNewsQueue.java | 4 +- source/yacy.java | 10 +- 25 files changed, 253 insertions(+), 121 deletions(-) diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 8d91e1546..89196a279 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -153,7 +153,7 @@ public class IndexControl_p { int i = 0; urlx = new String[index.size()]; while (en.hasNext()) { - urlx[i++] = ((indexURLEntry) en.next()).getUrlHash(); + urlx[i++] = ((indexURLEntry) en.next()).urlHash(); } index = null; } @@ -259,15 +259,15 @@ public class IndexControl_p { while (urlIter.hasNext()) { indexEntry = (indexURLEntry) urlIter.next(); try { - lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null); + lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.urlHash(), null); if (lurl.toString() == null) { - unknownURLEntries.add(indexEntry.getUrlHash()); + unknownURLEntries.add(indexEntry.urlHash()); urlIter.remove(); } else { - knownURLs.put(indexEntry.getUrlHash(), lurl); + knownURLs.put(indexEntry.urlHash(), lurl); } } catch (IOException e) { - unknownURLEntries.add(indexEntry.getUrlHash()); + unknownURLEntries.add(indexEntry.urlHash()); } } // use whats remaining @@ -441,7 +441,7 @@ public class IndexControl_p { indexURLEntry xi; while (en.hasNext()) { xi = (indexURLEntry) en.next(); - uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())}; + uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())}; try { us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString(); tm.put(us, uh); diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 012c9724e..1b3840462 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -149,7 +149,7 @@ public final class transferRWI { sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true); serverCore.checkInterruption(); - urlHash = entry.getUrlHash(); + urlHash = entry.urlHash(); try { if ((!(unknownURL.contains(urlHash))) && (!(sb.urlPool.loadedURL.exists(urlHash)))) { diff --git a/source/de/anomic/index/indexAbstractEntry.java b/source/de/anomic/index/indexAbstractEntry.java index 53218ddca..0fd747897 100644 --- a/source/de/anomic/index/indexAbstractEntry.java +++ b/source/de/anomic/index/indexAbstractEntry.java @@ -107,10 +107,10 @@ public abstract class indexAbstractEntry implements indexEntry { return e; } - public String getUrlHash() { return urlHash; } - public int getQuality() { return quality; } - public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); } - public long getLastModified() { return lastModified; } + public String urlHash() { return urlHash; } + public int quality() { return quality; } + public int virtualAge() { return plasmaWordIndex.microDateDays(lastModified); } + public long lastModified() { return lastModified; } public int hitcount() { return hitcount; } public int posintext() { return posintext; } public int posinphrase() { return posinphrase; } @@ -125,7 +125,7 @@ public abstract class indexAbstractEntry implements indexEntry { public boolean isNewer(indexEntry other) { if (other == null) return true; if (this.lastModified > ((indexAbstractEntry) other).lastModified) return true; - if (this.lastModified == ((indexAbstractEntry) other).getLastModified()) { + if (this.lastModified == ((indexAbstractEntry) other).lastModified()) { if (this.quality > ((indexAbstractEntry) other).quality) return true; } return false; @@ -133,8 +133,8 @@ public abstract class indexAbstractEntry implements indexEntry { public boolean isOlder(indexEntry other) { if (other == null) return false; - if (this.lastModified < ((indexAbstractEntry) other).getLastModified()) return true; - if (this.lastModified == ((indexAbstractEntry) other).getLastModified()) { + if (this.lastModified < ((indexAbstractEntry) other).lastModified()) return true; + if (this.lastModified == ((indexAbstractEntry) other).lastModified()) { if (this.quality < ((indexAbstractEntry) other).quality) return true; } return false; diff --git a/source/de/anomic/index/indexEntry.java b/source/de/anomic/index/indexEntry.java index 8e9364c6a..72e2c343a 100644 --- a/source/de/anomic/index/indexEntry.java +++ b/source/de/anomic/index/indexEntry.java @@ -36,7 +36,20 @@ public interface indexEntry { public String toPropertyForm(); public kelondroRow.Entry toKelondroEntry(); - public String getUrlHash(); + public String urlHash(); + public int quality(); + public int virtualAge(); + public long lastModified(); + public int hitcount(); + public int posintext(); + public int posinphrase(); + public int posofphrase(); + public int wordcount(); + public int phrasecount(); + public String getLanguage(); + public char getType(); + public boolean isLocal(); + public void combineDistance(indexEntry oe); public int worddistance(); public void min(indexEntry other); diff --git a/source/de/anomic/index/indexRAMCacheRI.java b/source/de/anomic/index/indexRAMCacheRI.java index 367da470f..6cd7fc2f7 100644 --- a/source/de/anomic/index/indexRAMCacheRI.java +++ b/source/de/anomic/index/indexRAMCacheRI.java @@ -119,7 +119,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { row.setCol(0, container.getWordHash().getBytes()); row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4)); row.setCol(2, kelondroNaturalOrder.encodeLong(container.updated(), 8)); - row.setCol(3, wordEntry.getUrlHash().getBytes()); + row.setCol(3, wordEntry.urlHash().getBytes()); row.setCol(4, wordEntry.toEncodedStringForm().getBytes()); dumpArray.set((int) urlcount++, row); } @@ -148,7 +148,7 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI { row.setCol(0, wordHash.getBytes()); row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4)); row.setCol(2, kelondroNaturalOrder.encodeLong(updateTime, 8)); - row.setCol(3, wordEntry.getUrlHash().getBytes()); + row.setCol(3, wordEntry.urlHash().getBytes()); row.setCol(4, wordEntry.toEncodedStringForm().getBytes()); dumpArray.set((int) urlcount++, row); } diff --git a/source/de/anomic/index/indexTreeMapContainer.java b/source/de/anomic/index/indexTreeMapContainer.java index 3923c227b..deb0f8355 100644 --- a/source/de/anomic/index/indexTreeMapContainer.java +++ b/source/de/anomic/index/indexTreeMapContainer.java @@ -134,9 +134,9 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen private boolean addi(indexEntry entry) { // returns true if the new entry was added, false if it already existed - indexURLEntry oldEntry = (indexURLEntry) container.put(entry.getUrlHash(), entry); + indexURLEntry oldEntry = (indexURLEntry) container.put(entry.urlHash(), entry); if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container - container.put(entry.getUrlHash(), oldEntry); // put it back + container.put(entry.urlHash(), oldEntry); // put it back return false; } return (oldEntry == null); @@ -259,7 +259,7 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen long stamp = System.currentTimeMillis(); while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { ie0 = (indexEntry) se.next(); - ie1 = large.get(ie0.getUrlHash()); + ie1 = large.get(ie0.urlHash()); if (ie1 != null) { // this is a hit. Calculate word distance: ie0.combineDistance(ie1); @@ -285,7 +285,7 @@ public final class indexTreeMapContainer extends indexAbstractContainer implemen long stamp = System.currentTimeMillis(); while ((System.currentTimeMillis() - stamp) < time) { - c = i1.getOrdering().compare(ie1.getUrlHash(), ie2.getUrlHash()); + c = i1.getOrdering().compare(ie1.urlHash(), ie2.urlHash()); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); if (c < 0) { if (e1.hasNext()) ie1 = (indexURLEntry) e1.next(); else break; diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index 56b2c0b8b..e966b9833 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -37,21 +37,13 @@ import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexAbstractEntry; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroColumn; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow.Entry; import de.anomic.plasma.plasmaWordIndex; public final class indexURLEntry extends indexAbstractEntry implements Cloneable, indexEntry { - public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{ - new kelondroColumn( - "nickname", - kelondroColumn.celltype_undefined, 4 /*cellwidth*/, - kelondroColumn.encoder_none, 0, - "description") - }); - + // the class instantiation can only be done by a plasmaStore method // therefore they are all public public indexURLEntry(String urlHash, @@ -180,7 +172,7 @@ public final class indexURLEntry extends indexAbstractEntry implements Cloneable } public Entry toKelondroEntry() { - kelondroRow.Entry entry = urlEntryRow.newEntry(toEncodedByteArrayForm()); + kelondroRow.Entry entry = indexURLEntryNew.urlEntryRow.newEntry(toEncodedByteArrayForm()); return entry; } diff --git a/source/de/anomic/kelondro/kelondroColumn.java b/source/de/anomic/kelondro/kelondroColumn.java index 2024c6129..a89067767 100644 --- a/source/de/anomic/kelondro/kelondroColumn.java +++ b/source/de/anomic/kelondro/kelondroColumn.java @@ -38,18 +38,18 @@ public class kelondroColumn { public static final int encoder_none = 0; public static final int encoder_b64e = 1; - public static final int encoder_string = 2; - public static final int encoder_bytes = 3; - public static final int encoder_char = 4; + public static final int encoder_b256 = 2; + public static final int encoder_string = 3; + public static final int encoder_bytes = 4; + public static final int encoder_char = 5; - private int celltype, cellwidth, encoder, encodedwidth; + private int celltype, cellwidth, encoder; private String nickname, description; - public kelondroColumn(String nickname, int celltype, int cellwidth, int encoder, int encodedwidth, String description) { + public kelondroColumn(String nickname, int celltype, int encoder, int cellwidth, String description) { this.celltype = celltype; this.cellwidth = cellwidth; this.encoder = encoder; - this.encodedwidth = encodedwidth; this.nickname = nickname; this.description = description; } @@ -66,10 +66,6 @@ public class kelondroColumn { return this.encoder; } - public int encodedwidth() { - return this.encodedwidth; - } - public String nickname() { return this.nickname; } diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java index f6b90fb01..b65e4050e 100644 --- a/source/de/anomic/kelondro/kelondroRow.java +++ b/source/de/anomic/kelondro/kelondroRow.java @@ -33,18 +33,15 @@ public class kelondroRow { private kelondroColumn[] row; protected int[] colstart; - private int encodedFormLength; private int objectsize; public kelondroRow(kelondroColumn[] row) { this.row = row; this.colstart = new int[row.length]; this.objectsize = 0; - this.encodedFormLength = 0; for (int i = 0; i < row.length; i++) { this.colstart[i] = this.objectsize; this.objectsize += this.row[i].cellwidth(); - this.encodedFormLength += this.row[i].encodedwidth(); } } @@ -53,12 +50,10 @@ public class kelondroRow { this.row = new kelondroColumn[rowi.length]; this.colstart = new int[rowi.length]; this.objectsize = 0; - this.encodedFormLength = 0; for (int i = 0; i < rowi.length; i++) { - this.row[i] = new kelondroColumn("col_" + i, kelondroColumn.celltype_undefined, rowi[i], kelondroColumn.encoder_none, rowi[i], ""); + this.row[i] = new kelondroColumn("col_" + i, kelondroColumn.celltype_undefined, kelondroColumn.encoder_none, rowi[i], ""); this.colstart[i] = this.objectsize; this.objectsize += this.row[i].cellwidth(); - this.encodedFormLength += this.row[i].encodedwidth(); } } @@ -158,11 +153,49 @@ public class kelondroRow { } } + public void setColByte(int column, byte c) { + rowinstance[colstart[column]] = c; + } + + public void setColString(int column, String cell, String encoding) { + if (encoding == null) + setCol(column, cell.getBytes()); + else + try { + setCol(column, cell.getBytes(encoding)); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } + } + + public void setColLong(int column, long cell) { + // uses the column definition to choose the right encoding + switch (row[column].encoder()) { + case kelondroColumn.encoder_none: + throw new kelondroException("ROW", "setColLong has celltype none, no encoder given"); + case kelondroColumn.encoder_b64e: + setColLongB64E(column, cell); + break; + case kelondroColumn.encoder_b256: + setColLongB256(column, cell); + break; + case kelondroColumn.encoder_string: + setCol(column, Long.toString(cell).getBytes()); + break; + case kelondroColumn.encoder_bytes: + throw new kelondroException("ROW", "setColLong of celltype bytes not applicable"); + case kelondroColumn.encoder_char: + throw new kelondroException("ROW", "setColLong of celltype char not applicable"); + } + } + public void setColLongB256(int column, long cell) { + // temporary method, should be replaced by setColLong if all row declarations are complete kelondroNaturalOrder.encodeLong(cell, rowinstance, colstart[column], row[column].cellwidth()); } public void setColLongB64E(int column, long cell) { + // temporary method, should be replaced by setColLong if all row declarations are complete kelondroBase64Order.enhancedCoder.encodeLong(cell, rowinstance, colstart[column], row[column].cellwidth()); } @@ -183,11 +216,32 @@ public class kelondroRow { } } + public long getColLong(int column) { + // uses the column definition to choose the right encoding + switch (row[column].encoder()) { + case kelondroColumn.encoder_none: + throw new kelondroException("ROW", "getColLong has celltype none, no encoder given"); + case kelondroColumn.encoder_b64e: + return getColLongB64E(column); + case kelondroColumn.encoder_b256: + return getColLongB256(column); + case kelondroColumn.encoder_string: + return Long.parseLong(getColString(column, null)); + case kelondroColumn.encoder_bytes: + throw new kelondroException("ROW", "getColLong of celltype bytes not applicable"); + case kelondroColumn.encoder_char: + throw new kelondroException("ROW", "getColLong of celltype char not applicable"); + } + throw new kelondroException("ROW", "getColLong did not find appropriate encoding"); + } + public long getColLongB256(int column) { + // temporary method, should be replaced by getColLong if all row declarations are complete return kelondroNaturalOrder.decodeLong(rowinstance, colstart[column], row[column].cellwidth()); } public long getColLongB64E(int column) { + // temporary method, should be replaced by getColLong if all row declarations are complete return kelondroBase64Order.enhancedCoder.decodeLong(rowinstance, colstart[column], row[column].cellwidth()); } @@ -202,30 +256,30 @@ public class kelondroRow { } public byte[] toEncodedBytesForm() { - byte[] b = new byte[encodedFormLength]; - int encoder, encodedwidth; + byte[] b = new byte[objectsize]; + int encoder, cellwidth; int p = 0; for (int i = 0; i < row.length; i++) { encoder = row[i].encoder(); - encodedwidth = row[i].encodedwidth(); + cellwidth = row[i].cellwidth(); switch (row[i].celltype()) { case kelondroColumn.celltype_undefined: throw new kelondroException("ROW", "toEncodedForm of celltype undefined not possible"); case kelondroColumn.celltype_boolean: throw new kelondroException("ROW", "toEncodedForm of celltype boolean not yet implemented"); case kelondroColumn.celltype_binary: - System.arraycopy(rowinstance, colstart[i], b, p, encodedwidth); - p += encodedwidth; + System.arraycopy(rowinstance, colstart[i], b, p, cellwidth); + p += cellwidth; continue; case kelondroColumn.celltype_string: - System.arraycopy(rowinstance, colstart[i], b, p, encodedwidth); - p += encodedwidth; + System.arraycopy(rowinstance, colstart[i], b, p, cellwidth); + p += cellwidth; continue; case kelondroColumn.celltype_cardinal: if (encoder == kelondroColumn.encoder_b64e) { - long c = bytes2long(rowinstance, colstart[i]); - System.arraycopy(kelondroBase64Order.enhancedCoder.encodeLongSmart(c, encodedwidth).getBytes(), 0, b, p, encodedwidth); - p += encodedwidth; + long c = bytes2long(rowinstance, colstart[i], cellwidth); + System.arraycopy(kelondroBase64Order.enhancedCoder.encodeLongSmart(c, cellwidth).getBytes(), 0, b, p, cellwidth); + p += cellwidth; continue; } throw new kelondroException("ROW", "toEncodedForm of celltype cardinal has no encoder (" + encoder + ")"); @@ -236,6 +290,49 @@ public class kelondroRow { return b; } + public String toPropertyForm() { + StringBuffer sb = new StringBuffer(); + sb.append("{"); + int encoder, cellwidth; + for (int i = 0; i < row.length; i++) { + encoder = row[i].encoder(); + cellwidth = row[i].cellwidth(); + switch (row[i].celltype()) { + case kelondroColumn.celltype_undefined: + throw new kelondroException("ROW", "toEncodedForm of celltype undefined not possible"); + case kelondroColumn.celltype_boolean: + throw new kelondroException("ROW", "toEncodedForm of celltype boolean not yet implemented"); + case kelondroColumn.celltype_binary: + sb.append(row[i].nickname()); + sb.append('='); + for (int j = colstart[i]; j < colstart[i] + cellwidth; j++) sb.append((char) rowinstance[j]); + sb.append(','); + continue; + case kelondroColumn.celltype_string: + sb.append(row[i].nickname()); + sb.append('='); + for (int j = colstart[i]; j < colstart[i] + cellwidth; j++) sb.append((char) rowinstance[j]); + sb.append(','); + continue; + case kelondroColumn.celltype_cardinal: + if (encoder == kelondroColumn.encoder_b64e) { + sb.append(row[i].nickname()); + sb.append('='); + long c = bytes2long(rowinstance, colstart[i], cellwidth); + sb.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(c, cellwidth).getBytes()); + sb.append(','); + continue; + } + throw new kelondroException("ROW", "toEncodedForm of celltype cardinal has no encoder (" + encoder + ")"); + case kelondroColumn.celltype_real: + throw new kelondroException("ROW", "toEncodedForm of celltype real not yet implemented"); + } + } + if (sb.charAt(sb.length() - 1) == ',') sb.deleteCharAt(sb.length() - 1); // remove ',' at end + sb.append("}"); + return sb.toString(); + } + public String toString() { StringBuffer b = new StringBuffer(); b.append('{'); @@ -255,10 +352,10 @@ public class kelondroRow { } } - public final static long bytes2long(byte[] b, int offset) { + public final static long bytes2long(byte[] b, int offset, int length) { if (b == null) return 0; long x = 0; - for (int i = 0; i < b.length; i++) x = (x << 8) | (0xff & b[offset + i]); + for (int i = 0; i < length; i++) x = (x << 8) | (0xff & b[offset + i]); return x; } diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java index f82012478..6fab25910 100644 --- a/source/de/anomic/kelondro/kelondroTree.java +++ b/source/de/anomic/kelondro/kelondroTree.java @@ -1000,11 +1000,14 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex { setOrder.rotate(firstKey); TreeMap rows = new TreeMap(setOrder); Node n; + String key; synchronized (this) { Iterator i = (firstKey == null) ? new nodeIterator(up, rotating) : new nodeIterator(up, rotating, firstKey, including); while ((rows.size() < count) && (i.hasNext())) { n = (Node) i.next(); - if (n != null) rows.put(new String(n.getKey()), row().newEntry(n.getValueRow())); + if (n == null) return rows; + key = new String(n.getKey()); + if (rows.put(key, row().newEntry(n.getValueRow())) != null) return rows; // protection against loops } } return rows; @@ -1080,6 +1083,7 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex { if (!(bufferIterator.hasNext())) { // assign next buffer chunk try { + lastKey[lastKey.length - 1]++; rowBuffer = rowMap(inc, rot, lastKey, false, chunkSize); bufferIterator = rowBuffer.entrySet().iterator(); } catch (IOException e) { diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 6bb88bf9b..162e3cf06 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -135,7 +135,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { // getting next word index entry importWordIdxEntry = (indexURLEntry) importWordIdxEntries.next(); - String urlHash = importWordIdxEntry.getUrlHash(); + String urlHash = importWordIdxEntry.urlHash(); entityUrls.add(urlHash); } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 7f7521b3b..1f03a03a0 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -462,25 +462,33 @@ public final class plasmaCrawlLURL extends indexURL { this.urlHash = urlHash; kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes()); if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL"); + insertEntry(entry, searchedWord); + } + + public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException { + assert (entry != null); + insertEntry(entry, word); + } + + private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException { try { - if (entry != null) { - this.url = new URL(entry.getColString(1, "UTF-8").trim()); - this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim(); - this.moddate = new Date(86400000 * entry.getColLongB64E(3)); - this.loaddate = new Date(86400000 * entry.getColLongB64E(4)); - this.referrerHash = (entry.empty(5)) ? dummyHash : entry.getColString(5, "UTF-8"); - this.copyCount = (int) entry.getColLongB64E(6); - this.flags = entry.getColString(7, "UTF-8"); - this.quality = (int) entry.getColLongB64E(8); - this.language = entry.getColString(9, "UTF-8"); - this.doctype = (char) entry.getColByte(10); - this.size = (int) entry.getColLongB64E(11); - this.wordCount = (int) entry.getColLongB64E(12); - this.snippet = null; - this.word = searchedWord; - this.stored = false; - return; - } + this.urlHash = entry.getColString(0, null); + this.url = new URL(entry.getColString(1, "UTF-8").trim()); + this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim(); + this.moddate = new Date(86400000 * entry.getColLongB64E(3)); + this.loaddate = new Date(86400000 * entry.getColLongB64E(4)); + this.referrerHash = (entry.empty(5)) ? dummyHash : entry.getColString(5, "UTF-8"); + this.copyCount = (int) entry.getColLongB64E(6); + this.flags = entry.getColString(7, "UTF-8"); + this.quality = (int) entry.getColLongB64E(8); + this.language = entry.getColString(9, "UTF-8"); + this.doctype = (char) entry.getColByte(10); + this.size = (int) entry.getColLongB64E(11); + this.wordCount = (int) entry.getColLongB64E(12); + this.snippet = null; + this.word = searchedWord; + this.stored = false; + return; } catch (Exception e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e); throw new IOException("plasmaLURL.entry/1: " + e.toString()); @@ -764,8 +772,8 @@ public final class plasmaCrawlLURL extends indexURL { Iterator i; boolean error = false; - public kiter(boolean up, boolean rotating) throws IOException { - i = urlHashCache.rows(up, rotating, null); + public kiter(boolean up, boolean rotating, String firstHash) throws IOException { + i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); error = false; } @@ -777,12 +785,10 @@ public final class plasmaCrawlLURL extends indexURL { public Object next() throws RuntimeException { kelondroRow.Entry e = (kelondroRow.Entry) i.next(); if (e == null) return null; - String hash = null; try { - hash = new String(e.getColBytes(0)); - return new Entry(hash, null); + return new Entry(e, null); } catch (IOException ex) { - throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + hash); + throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); } } @@ -792,9 +798,9 @@ public final class plasmaCrawlLURL extends indexURL { } - public Iterator entries(boolean up, boolean rotating) throws IOException { + public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException { // enumerates entry elements - return new kiter(up, rotating); + return new kiter(up, rotating, firstHash); } /** @@ -807,7 +813,7 @@ public final class plasmaCrawlLURL extends indexURL { serverLog log = new serverLog("URLDBCLEANUP"); HashSet damagedURLS = new HashSet(); try { - Iterator eiter = entries(true, false); + Iterator eiter = entries(true, false, null); int iteratorCount = 0; while (eiter.hasNext()) try { eiter.next(); @@ -893,7 +899,7 @@ public final class plasmaCrawlLURL extends indexURL { public void run() { try { serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet"); - Iterator eiter = entries(true,false); + Iterator eiter = entries(true, false, null); while (eiter.hasNext() && run) { synchronized(this) { if (this.pause) { @@ -975,7 +981,7 @@ public final class plasmaCrawlLURL extends indexURL { if (args[0].equals("-l")) try { // arg 1 is path to URLCache final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0); - final Iterator enu = urls.entries(true, false); + final Iterator enu = urls.entries(true, false, null); while (enu.hasNext()) { ((Entry) enu.next()).print(); } diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index d184be8c5..b04ff0b12 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -209,19 +209,19 @@ public class plasmaDHTChunk { while ((urlIter.hasNext()) && (maxcount > refcount)) { indexEntry = (indexURLEntry) urlIter.next(); try { - lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry); + lurl = lurls.getEntry(indexEntry.urlHash(), indexEntry); if ((lurl == null) || (lurl.url() == null)) { notBoundCounter++; urlIter.remove(); - wordIndex.removeEntries(nexthash, new String[] { indexEntry.getUrlHash() }, true); + wordIndex.removeEntries(nexthash, new String[] { indexEntry.urlHash() }, true); } else { - urlCache.put(indexEntry.getUrlHash(), lurl); + urlCache.put(indexEntry.urlHash(), lurl); refcount++; } } catch (IOException e) { notBoundCounter++; urlIter.remove(); - wordIndex.removeEntries(nexthash, new String[] { indexEntry.getUrlHash() }, true); + wordIndex.removeEntries(nexthash, new String[] { indexEntry.urlHash() }, true); } } @@ -279,7 +279,7 @@ public class plasmaDHTChunk { urlIter = this.indexContainers[i].entries(); while (urlIter.hasNext()) { indexEntry = (indexURLEntry) urlIter.next(); - urlHashes[c++] = indexEntry.getUrlHash(); + urlHashes[c++] = indexEntry.urlHash(); } count += wordIndex.removeEntries(this.indexContainers[i].getWordHash(), urlHashes, true); log.logFine("Deleted partial index (" + c + " URLs) for word " + this.indexContainers[i].getWordHash() + "; " + this.wordIndex.indexSize(indexContainers[i].getWordHash()) + " entries left"); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index d0c3b02cc..06b98c37f 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -251,7 +251,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { entry = preorder.next(); // find the url entry try { - page = urlStore.getEntry(entry.getUrlHash(), entry); + page = urlStore.getEntry(entry.urlHash(), entry); // add a result acc.addResult(entry, page); } catch (IOException e) { diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index ed879f40f..a4859662a 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -142,7 +142,7 @@ public final class plasmaSearchPreOrder { i = container.entries(); for (int j = 0; j < count; j++) { indexEntry = (indexURLEntry) i.next(); - pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.getUrlHash(), indexEntry); + pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(indexEntry.generateNormalized(entryMin, entryMax)), 16) + indexEntry.urlHash(), indexEntry); } } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 1c0a632ef..a00cd4e20 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -168,9 +168,9 @@ public class plasmaSearchRankingProfile { long ranking = 0; if (entry instanceof indexURLEntry) { indexURLEntry normalizedEntry = (indexURLEntry) entry; - ranking += normalizedEntry.getQuality() << ((Integer) coeff.get(ENTROPY)).intValue(); - ranking += normalizedEntry.getVirtualAge() << ((Integer) coeff.get(DATE)).intValue(); - ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.getUrlHash()) << ((Integer) coeff.get(YBR)).intValue(); + ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue(); + ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue(); + ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue(); ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue(); ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue(); ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue(); diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 1ac47ac94..14b2f1da3 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -161,7 +161,7 @@ public final class plasmaSearchResult { // insert value //System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url()); - pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), page); + pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.urlHash(), page); } // flush memory diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 8cc0fe767..853701ce4 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -594,12 +594,12 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { // System.out.println("Wordhash: "+wordHash+" UrlHash: // "+entry.getUrlHash()); try { - url = lurl.getEntry(entry.getUrlHash(), null).url(); + url = lurl.getEntry(entry.urlHash(), null).url(); if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(url) == true)) { - urlHashs.add(entry.getUrlHash()); + urlHashs.add(entry.urlHash()); } } catch (IOException e) { - urlHashs.add(entry.getUrlHash()); + urlHashs.add(entry.urlHash()); } } if (urlHashs.size() > 0) { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index d6bbbb892..1eb310159 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -150,7 +150,7 @@ public final class plasmaWordIndexAssortment { indexURLEntry entry; for (int i = 0; i < assortmentLength; i++) { entry = (indexURLEntry) entries.next(); - row.setCol(3 + 2 * i, entry.getUrlHash().getBytes()); + row.setCol(3 + 2 * i, entry.urlHash().getBytes()); row.setCol(4 + 2 * i, entry.toEncodedStringForm().getBytes()); } kelondroRow.Entry oldrow = null; diff --git a/source/de/anomic/plasma/plasmaWordIndexFile.java b/source/de/anomic/plasma/plasmaWordIndexFile.java index a8ba85e83..4fb5a9fa3 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFile.java +++ b/source/de/anomic/plasma/plasmaWordIndexFile.java @@ -148,16 +148,16 @@ public final class plasmaWordIndexFile { } public boolean contains(indexURLEntry entry) throws IOException { - return (theIndex.get(entry.getUrlHash().getBytes()) != null); + return (theIndex.get(entry.urlHash().getBytes()) != null); } public boolean addEntry(indexURLEntry entry) throws IOException { if (entry == null) return false; - indexURLEntry oldEntry = getEntry(entry.getUrlHash()); + indexURLEntry oldEntry = getEntry(entry.urlHash()); if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity return false; } - return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedStringForm().getBytes()) == null); + return (theIndex.put(entry.urlHash().getBytes(), entry.toEncodedStringForm().getBytes()) == null); } public int addEntries(indexContainer container) throws IOException { diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 86ad320b1..dd6819a5f 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -63,6 +63,9 @@ import java.util.Properties; import java.util.Hashtable; import java.util.Iterator; +import de.anomic.kelondro.kelondroRow; +import de.anomic.kelondro.kelondroRowSet; + public final class serverFileUtils { /** @@ -326,6 +329,25 @@ public final class serverFileUtils { tf.renameTo(file); } + public static void saveSet(File file, kelondroRowSet set, String sep) throws IOException { + File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000)); + BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(tf)); + Iterator i = set.rows(); + String key; + if (i.hasNext()) { + key = new String(((kelondroRow.Entry) i.next()).getColBytes(0)); + bos.write(key.getBytes()); + } + while (i.hasNext()) { + key = new String(((kelondroRow.Entry) i.next()).getColBytes(0)); + if (sep != null) bos.write(sep.getBytes()); + bos.write(key.getBytes()); + } + bos.close(); + file.delete(); + tf.renameTo(file); + } + /** * Moves all files from a directory to another. * @param from_dir Directory which contents will be moved. diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 269976e7e..1e171abc3 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -893,8 +893,8 @@ public final class yacyClient { eenum = indexes[i].entries(); while (eenum.hasNext()) { entry = (indexURLEntry) eenum.next(); - if (urlCache.get(entry.getUrlHash()) == null) { - yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache"); + if (urlCache.get(entry.urlHash()) == null) { + yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache"); } } } diff --git a/source/de/anomic/yacy/yacyNewsDB.java b/source/de/anomic/yacy/yacyNewsDB.java index dcd76e441..f02810752 100644 --- a/source/de/anomic/yacy/yacyNewsDB.java +++ b/source/de/anomic/yacy/yacyNewsDB.java @@ -85,11 +85,11 @@ public class yacyNewsDB { } public static final kelondroRow rowdef = new kelondroRow(new kelondroColumn[]{ - new kelondroColumn("newsid", kelondroColumn.celltype_string, yacyNewsRecord.idLength(), kelondroColumn.encoder_string, yacyNewsRecord.idLength(), "id = created + originator"), - new kelondroColumn("category", kelondroColumn.celltype_string, yacyNewsRecord.categoryStringLength, kelondroColumn.encoder_string, yacyNewsRecord.categoryStringLength, ""), - new kelondroColumn("received", kelondroColumn.celltype_string, yacyCore.universalDateShortPattern.length(), kelondroColumn.encoder_string, yacyCore.universalDateShortPattern.length(), ""), - new kelondroColumn("", kelondroColumn.celltype_string, 2, kelondroColumn.encoder_string, 2, ""), - new kelondroColumn("", kelondroColumn.celltype_string, attributesMaxLength, kelondroColumn.encoder_string, attributesMaxLength, ""), + new kelondroColumn("newsid", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyNewsRecord.idLength(), "id = created + originator"), + new kelondroColumn("category", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyNewsRecord.categoryStringLength, ""), + new kelondroColumn("received", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyCore.universalDateShortPattern.length(), ""), + new kelondroColumn("", kelondroColumn.celltype_string, kelondroColumn.encoder_string, 2, ""), + new kelondroColumn("", kelondroColumn.celltype_string, kelondroColumn.encoder_string, attributesMaxLength, ""), }); private static kelondroTree createDB(File path, int bufferkb, long preloadTime) { diff --git a/source/de/anomic/yacy/yacyNewsQueue.java b/source/de/anomic/yacy/yacyNewsQueue.java index c39c17acc..b3a8ed264 100644 --- a/source/de/anomic/yacy/yacyNewsQueue.java +++ b/source/de/anomic/yacy/yacyNewsQueue.java @@ -77,8 +77,8 @@ public class yacyNewsQueue { } public static final kelondroRow rowdef = new kelondroRow(new kelondroColumn[]{ - new kelondroColumn("newsid", kelondroColumn.celltype_string, yacyNewsRecord.idLength(), kelondroColumn.encoder_string, yacyNewsRecord.idLength(), "id = created + originator"), - new kelondroColumn("last touched", kelondroColumn.celltype_string, yacyCore.universalDateShortPattern.length(), kelondroColumn.encoder_string, yacyCore.universalDateShortPattern.length(), "") + new kelondroColumn("newsid", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyNewsRecord.idLength(), "id = created + originator"), + new kelondroColumn("last touched", kelondroColumn.celltype_string, kelondroColumn.encoder_string, yacyCore.universalDateShortPattern.length(), "") }); private static kelondroStack createStack(File path) { diff --git a/source/yacy.java b/source/yacy.java index 412fd1367..e42702c04 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -723,7 +723,7 @@ public final class yacy { indexURLEntry wordIdxEntry; while (wordIdxEntries.hasNext()) { wordIdxEntry = (indexURLEntry) wordIdxEntries.next(); - String urlHash = wordIdxEntry.getUrlHash(); + String urlHash = wordIdxEntry.urlHash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null); urlCounter++; @@ -937,7 +937,7 @@ public final class yacy { File root = new File(homePath); try { plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000, 10000); - Iterator eiter = pool.loadedURL.entries(true, false); + Iterator eiter = pool.loadedURL.entries(true, false, null); HashSet doms = new HashSet(); plasmaCrawlLURL.Entry entry; System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries."); @@ -1018,7 +1018,9 @@ public final class yacy { } else { // plain text list - serverFileUtils.saveSet(new File(root, targetName + ".txt"), doms, new String(serverCore.crlf)); + File file = new File(root, targetName + ".txt"); + System.out.println("Started domain list dump to file " + file); + serverFileUtils.saveSet(file, doms, new String(serverCore.crlf)); } pool.close(); } catch (IOException e) { @@ -1030,7 +1032,7 @@ public final class yacy { File root = new File(homePath); try { plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, 1000, 1000, 10000); - Iterator eiter = pool.loadedURL.entries(true, false); + Iterator eiter = pool.loadedURL.entries(true, false, null); plasmaCrawlLURL.Entry entry; File file = new File(root, targetName); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));