";
} else {
@@ -463,7 +463,7 @@ public class IndexControl_p {
while (en.hasNext()) {
xi = (indexEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
- plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null);
+ plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
if (le == null) {
tm.put(uh[0], uh);
} else {
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 8681df3b4..ced7a6386 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -55,13 +55,13 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.net.URL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException;
-import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -107,7 +107,7 @@ public class ViewFile {
String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash
- Entry urlEntry = null;
+ plasmaCrawlLURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java
index 0b7a201e1..eaa44ca34 100644
--- a/htroot/htdocsdefault/dir.java
+++ b/htroot/htdocsdefault/dir.java
@@ -64,7 +64,7 @@ import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
-import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
@@ -358,7 +358,7 @@ public class dir {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
- final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.newEntry(
+ final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(),
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java
index a5c638469..b60545195 100644
--- a/htroot/yacy/crawlOrder.java
+++ b/htroot/yacy/crawlOrder.java
@@ -51,7 +51,7 @@ import java.util.Date;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
-import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -249,7 +249,7 @@ public final class crawlOrder {
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
- plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
+ plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
if (entry == null) {
response = "rejected";
lurl = "";
diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java
index fef31f6ca..aed450308 100644
--- a/htroot/yacy/crawlReceipt.java
+++ b/htroot/yacy/crawlReceipt.java
@@ -51,7 +51,7 @@ import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlEURL;
-import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@@ -124,7 +124,7 @@ public final class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// generating a new loaded URL entry
- plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
+ plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
if ((entry == null)||(entry.url()==null)) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 1b94af568..6f687d874 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -54,7 +54,7 @@ import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexURL;
-import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
@@ -244,10 +244,10 @@ public final class search {
StringBuffer links = new StringBuffer();
String resource = "";
//plasmaIndexEntry pie;
- plasmaCrawlLURL.Entry urlentry;
+ plasmaCrawlLURLEntry urlentry;
plasmaSnippetCache.Snippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
- urlentry = acc.nextElement();
+ urlentry = (plasmaCrawlLURLEntry) acc.nextElement();
if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000);
} else {
diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java
index 31e4cc1b6..281fd48da 100644
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@@ -48,7 +48,7 @@
import java.io.IOException;
import de.anomic.http.httpHeader;
-import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
@@ -90,7 +90,7 @@ public final class transferURL {
final int sizeBefore = sb.urlPool.loadedURL.size();
// read the urls from the other properties and store
String urls;
- plasmaCrawlLURL.Entry lEntry;
+ plasmaCrawlLURLEntry lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();
urls = (String) post.get("url" + i);
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 191fa3d71..71e7f8996 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -57,7 +57,7 @@ import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL;
-import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages;
import de.anomic.plasma.plasmaSearchPreOrder;
@@ -189,7 +189,7 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
- plasmaCrawlLURL.Entry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
+ plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) {
plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true);
if (document != null) {
diff --git a/source/dbtest.java b/source/dbtest.java
index e3b7288d6..bc7f5a5ba 100644
--- a/source/dbtest.java
+++ b/source/dbtest.java
@@ -13,6 +13,7 @@ import java.util.Iterator;
import java.util.Random;
import de.anomic.kelondro.kelondroBase64Order;
+import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroNaturalOrder;
@@ -186,6 +187,10 @@ public class dbtest {
File tablepath = new File(tablename).getParentFile();
table = new kelondroFlexTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder);
}
+ if (dbe.equals("kelondroFlexSplitTable")) {
+ File tablepath = new File(tablename).getParentFile();
+ table = new kelondroFlexSplitTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder);
+ }
if (dbe.equals("mysql")) {
table = new dbTable("mysql", testRow);
}
@@ -513,6 +518,10 @@ final class dbTable implements kelondroIndex {
}
}
+ public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
+ return put(row); // entryDate is not used by this implementation; delegates to put(row)
+ }
+
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
try {
diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java
index 91df5b9fc..12240d564 100644
--- a/source/de/anomic/index/indexURL.java
+++ b/source/de/anomic/index/indexURL.java
@@ -47,7 +47,7 @@ import de.anomic.yacy.yacySeedDB;
public class indexURL {
// day formatter for entry export
- protected static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
+ public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
// statics for value lengths
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
@@ -428,15 +428,6 @@ public class indexURL {
}
}
- public void store(kelondroRow.Entry entry, boolean cached) throws IOException {
- if ((cached) && (urlIndexCache != null))
- synchronized (urlIndexCache) {
- urlIndexCache.put(entry);
- }
- else
- urlIndexFile.put(entry);
- }
-
public void flushCacheSome() {
if (urlIndexCache == null) return;
if (urlIndexCache.size() == 0) return;
diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java
index 6dbaa2e2e..68e7da432 100644
--- a/source/de/anomic/kelondro/kelondroCollectionIndex.java
+++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java
@@ -446,7 +446,7 @@ public class kelondroCollectionIndex {
indexEntry.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
indexEntry.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
index.put(indexEntry);
- throw new kelondroException(arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, partitionnumber, serialnumber).toString(), "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed");
+ throw new kelondroException(array.filename, "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed");
}
int chunkcountInArray = collection.size();
if (chunkcountInArray != chunkcount) {
diff --git a/source/de/anomic/kelondro/kelondroColumn.java b/source/de/anomic/kelondro/kelondroColumn.java
index c4c211cf1..9b56e923c 100644
--- a/source/de/anomic/kelondro/kelondroColumn.java
+++ b/source/de/anomic/kelondro/kelondroColumn.java
@@ -209,6 +209,11 @@ public class kelondroColumn {
public String toString() {
StringBuffer s = new StringBuffer();
switch (celltype) {
+ case celltype_undefined:
+ s.append(nickname);
+ s.append('-');
+ s.append(cellwidth);
+ break;
case celltype_boolean:
s.append("boolean ");
s.append(nickname);
diff --git a/source/de/anomic/kelondro/kelondroFlexTable.java b/source/de/anomic/kelondro/kelondroFlexTable.java
index d171cab8c..e5bc41edb 100644
--- a/source/de/anomic/kelondro/kelondroFlexTable.java
+++ b/source/de/anomic/kelondro/kelondroFlexTable.java
@@ -27,6 +27,7 @@ package de.anomic.kelondro;
import java.io.File;
import java.io.IOException;
+import java.util.Date;
import java.util.Iterator;
public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondroIndex {
@@ -137,6 +138,10 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr
return super.get(i);
}
+ public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
+ return put(row); // entryDate is not used by this implementation; delegates to put(row)
+ }
+
public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
int i = index.geti(row.getColBytes(0));
if (i < 0) {
diff --git a/source/de/anomic/kelondro/kelondroIndex.java b/source/de/anomic/kelondro/kelondroIndex.java
index bc3b6969b..ca426cfb7 100644
--- a/source/de/anomic/kelondro/kelondroIndex.java
+++ b/source/de/anomic/kelondro/kelondroIndex.java
@@ -51,6 +51,7 @@
package de.anomic.kelondro;
import java.io.IOException;
+import java.util.Date;
import java.util.Iterator;
public interface kelondroIndex {
@@ -60,6 +61,7 @@ public interface kelondroIndex {
public kelondroRow row() throws IOException;
public kelondroRow.Entry get(byte[] key) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException;
+ public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException;
public kelondroRow.Entry remove(byte[] key) throws IOException;
public Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException;
public void close() throws IOException;
diff --git a/source/de/anomic/kelondro/kelondroRAMIndex.java b/source/de/anomic/kelondro/kelondroRAMIndex.java
index df0acd6d0..b7792215f 100644
--- a/source/de/anomic/kelondro/kelondroRAMIndex.java
+++ b/source/de/anomic/kelondro/kelondroRAMIndex.java
@@ -26,6 +26,8 @@
package de.anomic.kelondro;
+import java.io.IOException;
+import java.util.Date;
import java.util.Iterator;
import java.util.TreeMap;
@@ -59,6 +61,10 @@ public class kelondroRAMIndex implements kelondroIndex {
return (kelondroRow.Entry) index.get(key);
}
+ public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
+ return put(row); // entryDate is not used by this implementation; delegates to put(row)
+ }
+
public synchronized Entry put(Entry row) {
return (kelondroRow.Entry) index.put(row.getColBytes(0), row);
}
diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java
index 41ae99c5f..34c184d0d 100644
--- a/source/de/anomic/kelondro/kelondroRecords.java
+++ b/source/de/anomic/kelondro/kelondroRecords.java
@@ -976,7 +976,7 @@ public class kelondroRecords {
return USAGE.FREEC;
}
- private final void dispose(Handle h) throws IOException {
+ private synchronized final void dispose(Handle h) throws IOException {
// delete element with handle h
// this element is then connected to the deleted-chain and can be
// re-used change counter
@@ -1052,7 +1052,7 @@ public class kelondroRecords {
if (markedDeleted.contains(h)) {
// loop detection
this.theLogger.severe("KELONDRO WARNING " + this.filename + ": FREE-Queue contains loops");
- return markedDeleted;
+ return markedDeleted; // TODO: automatic fix
}
markedDeleted.add(h);
seekp = seekpos(h);
diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java
index fc00799a1..4eafe2efa 100644
--- a/source/de/anomic/kelondro/kelondroRowSet.java
+++ b/source/de/anomic/kelondro/kelondroRowSet.java
@@ -25,6 +25,7 @@
package de.anomic.kelondro;
import java.io.IOException;
+import java.util.Date;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeSet;
@@ -76,6 +77,10 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
return entry;
}
+ public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
+ return put(row); // entryDate is not used by this implementation; delegates to put(row)
+ }
+
public kelondroRow.Entry put(kelondroRow.Entry entry) {
long handle = profile.startWrite();
int index = -1;
diff --git a/source/de/anomic/kelondro/kelondroSplittedTree.java b/source/de/anomic/kelondro/kelondroSplittedTree.java
index 11f7be948..9765d9992 100644
--- a/source/de/anomic/kelondro/kelondroSplittedTree.java
+++ b/source/de/anomic/kelondro/kelondroSplittedTree.java
@@ -47,6 +47,7 @@ package de.anomic.kelondro;
import java.io.File;
import java.io.IOException;
+import java.util.Date;
import java.util.Iterator;
public class kelondroSplittedTree implements kelondroIndex {
@@ -109,6 +110,10 @@ public class kelondroSplittedTree implements kelondroIndex {
return ktfs[partition(key)].get(key);
}
+ public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
+ return put(row); // entryDate is not used by this implementation; delegates to put(row)
+ }
+
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
return ktfs[partition(row.getColBytes(0))].put(row);
}
diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java
index aad149e1b..a674df9d7 100644
--- a/source/de/anomic/kelondro/kelondroTree.java
+++ b/source/de/anomic/kelondro/kelondroTree.java
@@ -50,6 +50,7 @@ import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.RandomAccessFile;
+import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
@@ -404,8 +405,12 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
return (lc.equals(childn.handle()));
}
- // Associates the specified value with the specified key in this map
+ public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
+ return put(row); // entryDate is not used by this implementation; delegates to put(row)
+ }
+
public kelondroRow.Entry put(kelondroRow.Entry newrow) throws IOException {
+ // Associates the specified value with the specified key in this map
kelondroRow.Entry result = null;
//writeLock.stay(2000, 1000);
if (newrow.columns() != row().columns()) throw new IllegalArgumentException("put: wrong row length " + newrow.columns() + "; must be " + row().columns());
diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
index b15bcac4c..1f2fe5288 100644
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@@ -9,6 +9,7 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate;
@@ -155,7 +156,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url
// getting the url entry
- plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.load(urlHash, null);
+ plasmaCrawlLURLEntry urlEntry = this.importUrlDB.load(urlHash, null);
if (urlEntry != null) {
/* write it into the home url db */
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index 55d9665c1..e05edfcb3 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -61,14 +61,11 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
-import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
-import de.anomic.index.indexURLEntry;
-import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRAMIndex;
import de.anomic.kelondro.kelondroRow;
@@ -78,7 +75,6 @@ import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
-import de.anomic.tools.crypt;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@@ -94,32 +90,16 @@ public final class plasmaCrawlLURL extends indexURL {
private final LinkedList proxyResultStack; // 4 - local index: result of proxy fetch/prefetch
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
-
- //public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
public plasmaCrawlLURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
super();
- kelondroRow rowdef = new kelondroRow(
- "String urlhash-" + urlHashLength + ", " + // the url's hash
- "String urlstring-" + urlStringLength + ", " + // the url as string
- "String urldescr-" + urlDescrLength + ", " + // the description of the url
- "Cardinal moddate-" + urlDateLength + " {b64e}, " + // last-modified from the httpd
- "Cardinal loaddate-" + urlDateLength + " {b64e}, " + // time when the url was loaded
- "String refhash-" + urlHashLength + ", " + // the url's referrer hash
- "Cardinal copycount-" + urlCopyCountLength + " {b64e}, " + //
- "byte[] flags-" + urlFlagLength + ", " + // flags
- "Cardinal quality-" + urlQualityLength + " {b64e}, " + //
- "String language-" + urlLanguageLength + ", " + //
- "byte[] doctype-" + urlDoctypeLength + ", " + //
- "Cardinal size-" + urlSizeLength + " {b64e}, " + // size of file in bytes
- "Cardinal wc-" + urlWordCountLength + " {b64e}"); // word count
File cacheFile = new File(cachePath, "urlHash.db");
cacheFile.getParentFile().mkdirs();
try {
- urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef);
- urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, rowdef);
+ urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef);
+ urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, plasmaCrawlLURLOldEntry.rowdef);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@@ -134,19 +114,19 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack = new LinkedList();
}
- public synchronized void stack(Entry e, String initiatorHash, String executorHash, int stackType) {
+ public synchronized void stack(plasmaCrawlLURLEntry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; }
try {
if (initiatorHash == null) { initiatorHash = dummyHash; }
if (executorHash == null) { executorHash = dummyHash; }
switch (stackType) {
case 0: break;
- case 1: externResultStack.add(e.urlHash + initiatorHash + executorHash); break;
- case 2: searchResultStack.add(e.urlHash + initiatorHash + executorHash); break;
- case 3: transfResultStack.add(e.urlHash + initiatorHash + executorHash); break;
- case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break;
- case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
- case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
+ case 1: externResultStack.add(e.hash() + initiatorHash + executorHash); break;
+ case 2: searchResultStack.add(e.hash() + initiatorHash + executorHash); break;
+ case 3: transfResultStack.add(e.hash() + initiatorHash + executorHash); break;
+ case 4: proxyResultStack.add(e.hash() + initiatorHash + executorHash); break;
+ case 5: lcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break;
+ case 6: gcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break;
}
return;
} catch (Exception ex) {
@@ -159,7 +139,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
- public Entry load(String urlHash, indexEntry searchedWord) {
+ public plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@@ -171,19 +151,18 @@ public final class plasmaCrawlLURL extends indexURL {
try {
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
- return new Entry(entry, searchedWord);
+ return new plasmaCrawlLURLOldEntry(entry, searchedWord);
} catch (IOException e) {
return null;
}
}
- public void store(Entry entry, boolean cached) throws IOException {
+ public void store(plasmaCrawlLURLEntry entry, boolean cached) throws IOException {
// Check if there is a more recent Entry already in the DB
- if (entry.stored) return;
- Entry oldEntry;
+ plasmaCrawlLURLEntry oldEntry;
try {
- if (exists(entry.urlHash)) {
- oldEntry = load(entry.urlHash, null);
+ if (exists(entry.hash())) {
+ oldEntry = load(entry.hash(), null);
} else {
oldEntry = null;
}
@@ -194,40 +173,32 @@ public final class plasmaCrawlLURL extends indexURL {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
- entry.descr = oldEntry.descr;
- entry.moddate = oldEntry.moddate;
- entry.loaddate = oldEntry.loaddate;
- entry.referrerHash = oldEntry.referrerHash;
- entry.copyCount = oldEntry.copyCount;
- entry.flags = oldEntry.flags;
- entry.quality = oldEntry.quality;
- entry.language = oldEntry.language;
- entry.doctype = oldEntry.doctype;
- entry.size = oldEntry.size;
- entry.wordCount = oldEntry.wordCount;
- // this.snippet // not read from db
- // this.word // not read from db
- entry.stored = true;
+ entry = oldEntry; // NOTE(review): this only rebinds the local parameter; the caller's entry object is not updated (the removed code copied oldEntry's fields into it) - confirm this is intended
return; // this did not need to be stored, but is updated
}
- super.store(entry.toRowEntry(), cached);
- entry.stored = true;
+ if ((cached) && (urlIndexCache != null)) {
+ synchronized (urlIndexCache) {
+ urlIndexCache.put(entry.toRowEntry());
+ }
+ } else {
+ urlIndexFile.put(entry.toRowEntry(), entry.loaddate());
+ }
}
-
- public synchronized Entry newEntry(String propStr, boolean setGlobal) {
+
+ public synchronized plasmaCrawlLURLEntry newEntry(String propStr, boolean setGlobal) {
if (propStr.startsWith("{") && propStr.endsWith("}")) {
- return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
+ return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
} else {
return null;
}
}
- public synchronized Entry newEntry(URL url, String descr, Date moddate, Date loaddate,
+ public synchronized plasmaCrawlLURLEntry newEntry(URL url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
int size, int wordCount) {
- Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
+ plasmaCrawlLURLEntry e = new plasmaCrawlLURLOldEntry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
return e;
}
@@ -365,7 +336,7 @@ public final class plasmaCrawlLURL extends indexURL {
String urlHash, initiatorHash, executorHash;
String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
- plasmaCrawlLURL.Entry urle;
+ plasmaCrawlLURLEntry urle;
URL url;
// needed for getCachePath(url)
@@ -412,317 +383,6 @@ public final class plasmaCrawlLURL extends indexURL {
return prop;
}
- public class Entry {
-
- private URL url;
-
- private String descr;
- private Date moddate;
- private Date loaddate;
- private String urlHash;
- private String referrerHash;
- private int copyCount;
- private String flags;
- private int quality;
- private String language;
- private char doctype;
- private int size;
- private int wordCount;
- private String snippet;
- private indexEntry word; // this is only used if the url is transported via remote search requests
- private boolean stored;
-
- // more needed attributes:
- // - author / copyright owner
- // - keywords
- // - phrasecount, total number of phrases
- // - boolean: URL attributes (see Word-Entity definition)
- // - boolean: appearance of bold and/or italics
- // - ETag: for re-crawl decision upon HEAD request
- // - int: # of outlinks to same domain
- // - int: # of outlinks to outside domain
- // - int: # of keywords
- // - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
-
- public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
- // create new entry and store it into database
- this.urlHash = urlHash(url);
- this.url = url;
- this.descr = (descr == null) ? this.url.toString() : descr;
- this.moddate = moddate;
- this.loaddate = loaddate;
- this.referrerHash = (referrerHash == null) ? dummyHash : referrerHash;
- this.copyCount = copyCount; // the number of remote (global) copies of this object without this one
- this.flags = (localNeed) ? "L " : " ";
- this.quality = quality;
- this.language = (language == null) ? "uk" : language;
- this.doctype = doctype;
- this.size = size;
- this.wordCount = wordCount;
- this.snippet = null;
- this.word = null;
- this.stored = false;
- }
-
- public Entry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
- try {
- this.urlHash = entry.getColString(0, null);
- this.url = new URL(entry.getColString(1, "UTF-8").trim());
- this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
- this.moddate = new Date(86400000 * entry.getColLong(3));
- this.loaddate = new Date(86400000 * entry.getColLong(4));
- this.referrerHash = (entry.empty(5)) ? dummyHash : entry.getColString(5, "UTF-8");
- this.copyCount = (int) entry.getColLong(6);
- this.flags = entry.getColString(7, "UTF-8");
- this.quality = (int) entry.getColLong(8);
- this.language = entry.getColString(9, "UTF-8");
- this.doctype = (char) entry.getColByte(10);
- this.size = (int) entry.getColLong(11);
- this.wordCount = (int) entry.getColLong(12);
- this.snippet = null;
- this.word = searchedWord;
- this.stored = false;
- return;
- } catch (Exception e) {
- serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
- throw new IOException("plasmaLURL.entry/1: " + e.toString());
- }
- }
-
- public Entry(Properties prop, boolean setGlobal) {
- // generates an plasmaLURLEntry using the properties from the argument
- // the property names must correspond to the one from toString
- //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
- this.urlHash = prop.getProperty("hash", dummyHash);
- try {
- //byte[][] entry = urlHashCache.get(urlHash.getBytes());
- //if (entry == null) {
- this.referrerHash = prop.getProperty("referrer", dummyHash);
- this.moddate = shortDayFormatter.parse(prop.getProperty("mod", "20000101"));
- //System.out.println("DEBUG: moddate = " + moddate + ", prop=" + prop.getProperty("mod"));
- this.loaddate = shortDayFormatter.parse(prop.getProperty("load", "20000101"));
- this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
- this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " ");
- if (setGlobal) this.flags = "G ";
- this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
- this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
- if (this.descr == null) this.descr = this.url.toString();
- this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
- this.language = prop.getProperty("lang", "uk");
- this.doctype = prop.getProperty("dt", "t").charAt(0);
- this.size = Integer.parseInt(prop.getProperty("size", "0"));
- this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
- this.snippet = prop.getProperty("snippet", "");
- if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
- this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
- this.stored = false;
- //}
- } catch (Exception e) {
- serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2:" +
- "\nProperties: " + ((prop==null)?null:prop.toString()) +
- ((prop.containsKey("word")) ? "\nWord: " + kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word","")) : "") +
- "\nErrorMsg: " + e.toString(), e);
- }
- }
-
- public kelondroRow.Entry toRowEntry() throws IOException {
- final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
- final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
-
- final byte[][] entry = new byte[][] {
- urlHash.getBytes(),
- url.toString().getBytes(),
- descr.getBytes(), // null?
- moddatestr.getBytes(),
- loaddatestr.getBytes(),
- referrerHash.getBytes(),
- kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(),
- flags.getBytes(),
- kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(),
- language.getBytes(),
- new byte[] {(byte) doctype},
- kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(),
- kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
- };
- return urlIndexFile.row().newEntry(entry);
- }
-
- public String hash() {
- // return a url-hash, based on the md5 algorithm
- // the result is a String of 12 bytes within a 72-bit space
- // (each byte has an 6-bit range)
- // that should be enough for all web pages on the world
- return this.urlHash;
- }
-
- public URL url() {
- return url;
- }
-
- public String descr() {
- return descr;
- }
-
- public Date moddate() {
- return moddate;
- }
-
- public Date loaddate() {
- return loaddate;
- }
-
- public String referrerHash() {
- // return the creator's hash
- return referrerHash;
- }
-
- public char doctype() {
- return doctype;
- }
-
- public int copyCount() {
- // return number of copies of this object in the global index
- return copyCount;
- }
-
- public boolean local() {
- // returns true if the url was created locally and is needed for own word index
- if (flags == null) return false;
- return flags.charAt(0) == 'L';
- }
-
- public int quality() {
- return quality;
- }
-
- public String language() {
- return language;
- }
-
- public int size() {
- return size;
- }
-
- public int wordCount() {
- return wordCount;
- }
-
- public String snippet() {
- // the snippet may appear here if the url was transported in a remote search
- // it will not be saved anywhere, but can only be requested here
- return snippet;
- }
-
- public indexEntry word() {
- return word;
- }
-
- public boolean isOlder (Entry other) {
- if (other == null) return false;
- if (moddate.before(other.moddate())) return true;
- if (moddate.equals(other.moddate())) {
- if (loaddate.before(other.loaddate())) return true;
- if (loaddate.equals(other.loaddate())) {
- if (quality < other.quality()) return true;
- }
- }
- return false;
- }
-
- private StringBuffer corePropList() {
- // generate a parseable string; this is a simple property-list
- final StringBuffer corePropStr = new StringBuffer(300);
- try {
- corePropStr
- .append("hash=") .append(urlHash)
- .append(",referrer=").append(referrerHash)
- .append(",mod=") .append(shortDayFormatter.format(moddate))
- .append(",load=") .append(shortDayFormatter.format(loaddate))
- .append(",size=") .append(size)
- .append(",wc=") .append(wordCount)
- .append(",cc=") .append(copyCount)
- .append(",local=") .append(((local()) ? "true" : "false"))
- .append(",q=") .append(kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength))
- .append(",dt=") .append(doctype)
- .append(",lang=") .append(language)
- .append(",url=") .append(crypt.simpleEncode(url.toString()))
- .append(",descr=") .append(crypt.simpleEncode(descr));
-
- if (this.word != null) {
- // append also word properties
- corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
- }
- return corePropStr;
-
- } catch (Exception e) {
-// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
-// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
-// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
-// e.printStackTrace();
- return null;
- }
- }
-
- /*
- public String toString(int posintext, int posinphrase, int posofphrase) {
- // add information needed for remote transport
- final StringBuffer core = corePropList();
- if (core == null) return null;
-
- core.ensureCapacity(core.length() + 200);
- core.insert(0,"{")
- .append(",posintext=").append(posintext)
- .append(",posinphrase=").append(posinphrase)
- .append(",posofphraseint=").append(posofphrase)
- .append("}");
- return core.toString();
- }
- */
-
- public String toString(String snippet) {
- // add information needed for remote transport
- final StringBuffer core = corePropList();
- if (core == null) return null;
-
- core.ensureCapacity(core.length() + snippet.length()*2);
- core.insert(0,"{");
- core.append(",snippet=").append(crypt.simpleEncode(snippet));
- core.append("}");
-
- return core.toString();
- //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
- }
-
- /**
- * Returns this object as String.
- * This e.g. looks like this:
- * {hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}
- */
- public String toString() {
- final StringBuffer core = corePropList();
- if (core == null) return null;
-
- core.insert(0,"{");
- core.append("}");
-
- return core.toString();
- //return "{" + core + "}";
- }
-
- public void print() {
- System.out.println("URL : " + url);
- System.out.println("Description : " + descr);
- System.out.println("Modified : " + httpc.dateString(moddate));
- System.out.println("Loaded : " + httpc.dateString(loaddate));
- System.out.println("Size : " + size + " bytes, " + wordCount + " words");
- System.out.println("Referrer Hash : " + referrerHash);
- System.out.println("Quality : " + quality);
- System.out.println("Language : " + language);
- System.out.println("DocType : " + doctype);
- System.out.println();
- }
- } // class Entry
-
public class kiter implements Iterator {
// enumerates entry elements
Iterator i;
@@ -742,7 +402,7 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow.Entry e = (kelondroRow.Entry) i.next();
if (e == null) return null;
try {
- return new Entry(e, null);
+ return new plasmaCrawlLURLOldEntry(e, null);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
}
@@ -873,7 +533,7 @@ public final class plasmaCrawlLURL extends indexURL {
}
}
- plasmaCrawlLURL.Entry entry = (plasmaCrawlLURL.Entry) eiter.next();
+ plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next();
totalSearchedUrls++;
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) ||
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) {
@@ -944,7 +604,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
- ((Entry) enu.next()).print();
+ ((plasmaCrawlLURLEntry) enu.next()).print();
}
} catch (Exception e) {
e.printStackTrace();
diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java
index 52171a235..5fd51eec6 100644
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@@ -385,7 +385,7 @@ public final class plasmaCrawlStacker {
checkInterruption();
String nexturlhash = indexURL.urlHash(nexturl);
String dbocc = this.sb.urlPool.exists(nexturlhash);
- plasmaCrawlLURL.Entry oldEntry = null;
+ plasmaCrawlLURLEntry oldEntry = null;
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {
diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java
index 27f515033..c827ee6af 100644
--- a/source/de/anomic/plasma/plasmaDHTChunk.java
+++ b/source/de/anomic/plasma/plasmaDHTChunk.java
@@ -199,7 +199,7 @@ public class plasmaDHTChunk {
indexContainer container;
Iterator urlIter;
indexEntry iEntry;
- plasmaCrawlLURL.Entry lurl;
+ plasmaCrawlLURLEntry lurl;
int refcount = 0;
int wholesize;
@@ -281,11 +281,11 @@ public class plasmaDHTChunk {
}
- public synchronized int deleteTransferIndexes() {
+ public synchronized String deleteTransferIndexes() {
Iterator urlIter;
indexEntry iEntry;
HashSet urlHashes;
- int count = 0;
+ String count = "0";
for (int i = 0; i < this.indexContainers.length; i++) {
// delete entries separately
@@ -301,7 +301,7 @@ public class plasmaDHTChunk {
urlHashes.add(iEntry.urlHash());
}
String wordHash = indexContainers[i].getWordHash();
- count += wordIndex.removeEntries(this.indexContainers[i].getWordHash(), urlHashes, true);
+ count = wordIndex.removeEntriesExpl(this.indexContainers[i].getWordHash(), urlHashes, true);
if (log.isFine())
log.logFine("Deleted partial index (" + c + " URLs) for word " + wordHash + "; " + this.wordIndex.indexSize(wordHash) + " entries left");
this.indexContainers[i] = null;
diff --git a/source/de/anomic/plasma/plasmaDHTFlush.java b/source/de/anomic/plasma/plasmaDHTFlush.java
index 9284bd254..10d95f563 100644
--- a/source/de/anomic/plasma/plasmaDHTFlush.java
+++ b/source/de/anomic/plasma/plasmaDHTFlush.java
@@ -222,7 +222,7 @@ public class plasmaDHTFlush extends Thread {
// deleting transfered words from index
if (this.delete) {
this.status = "Running: Deleting chunk " + iteration;
- int urlReferences = oldDHTChunk.deleteTransferIndexes();
+ String urlReferences = oldDHTChunk.deleteTransferIndexes();
this.log.logFine("Deleted from " + oldDHTChunk.containerSize() + " transferred RWIs locally " + urlReferences + " URL references");
}
oldDHTChunk = null;
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 71da580a2..466596b2c 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -370,7 +370,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//if (searchResult.size() == 0) return acc; // case that we have nothing to do
indexEntry entry;
- plasmaCrawlLURL.Entry page;
+ plasmaCrawlLURLEntry page;
Long preranking;
Object[] preorderEntry;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java
index d6ea1bd9d..d849b394d 100644
--- a/source/de/anomic/plasma/plasmaSearchImages.java
+++ b/source/de/anomic/plasma/plasmaSearchImages.java
@@ -101,7 +101,7 @@ public final class plasmaSearchImages {
public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) {
long start = System.currentTimeMillis();
this.images = new TreeSet();
- plasmaCrawlLURL.Entry urlentry;
+ plasmaCrawlLURLEntry urlentry;
while (sres.hasMoreElements()) {
urlentry = sres.nextElement();
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth));
diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java
index 18f353dd0..4985859f4 100644
--- a/source/de/anomic/plasma/plasmaSearchPreOrder.java
+++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java
@@ -185,7 +185,13 @@ public final class plasmaSearchPreOrder {
public Object[] /*{indexEntry, Long}*/ next() {
String top = (String) pageAcc.firstKey();
//System.out.println("preorder-key: " + top);
- Long preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // java.lang.NumberFormatException: For input string: "8000000000020b17" ???
+ Long preranking;
+ try {
+ preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // java.lang.NumberFormatException: For input string: "8000000000020b17" ???
+ } catch (NumberFormatException e) {
+ e.printStackTrace();
+ preranking = new Long(0);
+ }
return new Object[]{(indexEntry) pageAcc.remove(top), preranking};
}
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
index 0f04ab5ec..355f60839 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
@@ -191,7 +191,7 @@ public class plasmaSearchRankingProfile {
Set topwords,
String[] urlcomps,
String[] descrcomps,
- plasmaCrawlLURL.Entry page) {
+ plasmaCrawlLURLEntry page) {
// apply pre-calculated order attributes
long ranking = preranking;
diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java
index 9bd849ece..0a2234ce3 100644
--- a/source/de/anomic/plasma/plasmaSearchResult.java
+++ b/source/de/anomic/plasma/plasmaSearchResult.java
@@ -99,13 +99,13 @@ public final class plasmaSearchResult {
return pageAcc.size() > 0;
}
- public plasmaCrawlLURL.Entry nextElement() {
+ public plasmaCrawlLURLEntry nextElement() {
Object top = pageAcc.firstKey();
//System.out.println("postorder-key: " + ((String) top));
- return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
+ return (plasmaCrawlLURLEntry) pageAcc.remove(top);
}
- protected void addResult(plasmaCrawlLURL.Entry page, Long preranking) {
+ protected void addResult(plasmaCrawlLURLEntry page, Long preranking) {
// take out relevant information for reference computation
URL url = page.url();
@@ -132,12 +132,12 @@ public final class plasmaSearchResult {
for (int i = 0; i < references.length; i++) commonSense.add(references[i]);
Object[] resultVector;
- plasmaCrawlLURL.Entry page;
+ plasmaCrawlLURLEntry page;
long ranking;
for (int i = 0; i < results.size(); i++) {
// take out values from result array
resultVector = (Object[]) results.get(i);
- page = (plasmaCrawlLURL.Entry) resultVector[0];
+ page = (plasmaCrawlLURLEntry) resultVector[0];
// calculate ranking
if (postsort)
@@ -173,7 +173,7 @@ public final class plasmaSearchResult {
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
- path = urlPath(((plasmaCrawlLURL.Entry) entry.getValue()).url());
+ path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
paths.put(path, entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
@@ -184,7 +184,7 @@ public final class plasmaSearchResult {
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
- path = urlPath(((plasmaCrawlLURL.Entry) entry.getValue()).url());
+ path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
shorten = shortenPath(path);
// scan all subpaths of the url
while (shorten != null) {
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 6ee1f2de8..e6e6516aa 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -629,7 +629,7 @@ public class plasmaSnippetCache {
public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
// fetch snippets
int i = 0;
- plasmaCrawlLURL.Entry urlentry;
+ plasmaCrawlLURLEntry urlentry;
String urlstring;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index bc9ed397c..4f1e77eab 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1011,7 +1011,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// possibly delete entries from last chunk
if ((this.dhtTransferChunk != null) &&
(this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE)) {
- int deletedURLs = this.dhtTransferChunk.deleteTransferIndexes();
+ String deletedURLs = this.dhtTransferChunk.deleteTransferIndexes();
this.log.logFine("Deleted from " + this.dhtTransferChunk.containers().length + " transferred RWIs locally, removed " + deletedURLs + " URL references");
this.dhtTransferChunk = null;
}
@@ -1556,7 +1556,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
// create a new loaded URL db entry
- plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.newEntry(
+ plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
docDate, // modification date
@@ -1965,7 +1965,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
- plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true);
+ plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr, true);
urlPool.loadedURL.store(entry, false);
urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.noticeURL.remove(entry.hash());
@@ -2045,7 +2045,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int i = 0;
int p;
URL url;
- plasmaCrawlLURL.Entry urlentry;
+ plasmaCrawlLURLEntry urlentry;
String urlstring, urlname, filename, urlhash;
String host, hash, address, descr = "";
yacySeed seed;
@@ -2192,7 +2192,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
- plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
+ plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
URL url = entry.url();
diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
index 51d66e748..cfe7b1391 100644
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@@ -333,7 +333,7 @@ public class plasmaSwitchboardQueue {
public URL referrerURL() {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
- plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null);
+ plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url();
}
return referrerURL;
diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java
index eaa6f1cca..d1d4e0940 100644
--- a/source/de/anomic/plasma/plasmaURLPool.java
+++ b/source/de/anomic/plasma/plasmaURLPool.java
@@ -83,7 +83,7 @@ public class plasmaURLPool {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
} catch (IOException e) {}
- plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
+ plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null);
if (le != null) return le.url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index df03367d6..bcf518db2 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -484,13 +484,25 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
int removed = 0;
removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete);
removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete);
- if (removed == urlHashes.size()) return removed;
+ //if (removed == urlHashes.size()) return removed;
if (useCollectionIndex) {
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
- if (removed == urlHashes.size()) return removed;
+ //if (removed == urlHashes.size()) return removed;
}
removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete);
- if (removed == urlHashes.size()) return removed;
+ //if (removed == urlHashes.size()) return removed;
+ removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
+ return removed;
+ }
+
+ public String removeEntriesExpl(String wordHash, Set urlHashes, boolean deleteComplete) { // like removeEntries, but reports the removal count of each index store separately instead of their sum
+ String removed = ""; // built up as "<dhtIn>, <dhtOut>, <collections>, <assortments>, <backend>"
+ removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
+ removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
+ if (useCollectionIndex) {
+ removed += collections.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
+ } else removed += "0, "; // placeholder keeps the five-field layout when the collection index is not in use
+ removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
return removed;
}
@@ -772,7 +784,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
waiter();
entry = (indexEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
- plasmaCrawlLURL.Entry ue = lurl.load(entry.urlHash(), null);
+ plasmaCrawlLURLEntry ue = lurl.load(entry.urlHash(), null);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 6da6f90a7..61a5009cb 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -62,6 +62,7 @@ import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache;
@@ -496,7 +497,7 @@ public final class yacyClient {
}
// insert results to containers
- plasmaCrawlLURL.Entry urlEntry;
+ plasmaCrawlLURLEntry urlEntry;
String[] urls = new String[results];
for (int n = 0; n < results; n++) {
// get one single search result
@@ -862,7 +863,7 @@ public final class yacyClient {
-er crawlt, Ergebnis erscheint aber unter falschem initiator
*/
- public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURL.Entry entry, String wordhashes) {
+ public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURLEntry entry, String wordhashes) {
if (targetSeed == null) { return null; }
if (yacyCore.seedDB.mySeed == null) { return null; }
if (yacyCore.seedDB.mySeed == targetSeed) { return null; }
@@ -981,9 +982,9 @@ public final class yacyClient {
if (uhs.length == 0) { return resultObj; } // all url's known
// extract the urlCache from the result
- plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length];
+ plasmaCrawlLURLEntry[] urls = new plasmaCrawlLURLEntry[uhs.length];
for (int i = 0; i < uhs.length; i++) {
- urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]);
+ urls[i] = (plasmaCrawlLURLEntry) urlCache.get(uhs[i]);
if (urls[i] == null) {
yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
}
@@ -1092,7 +1093,7 @@ public final class yacyClient {
}
}
- private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURL.Entry[] urls, boolean gzipBody, int timeout) {
+ private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURLEntry[] urls, boolean gzipBody, int timeout) {
// this post a message to the remote message board
final String address = targetSeed.getAddress();
if (address == null) { return null; }
diff --git a/source/yacy.java b/source/yacy.java
index 8a4b8e25e..f937f3002 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -75,11 +75,15 @@ import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn;
+import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
+import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaCrawlLURLEntry;
+import de.anomic.plasma.plasmaCrawlLURLOldEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURLPool;
@@ -730,7 +734,7 @@ public final class yacy {
iEntry = (indexEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
- plasmaCrawlLURL.Entry urlEntry = currentUrlDB.load(urlHash, null);
+ plasmaCrawlLURLEntry urlEntry = currentUrlDB.load(urlHash, null);
urlCounter++;
minimizedUrlDB.store(urlEntry, false);
if (urlCounter % 500 == 0) {
@@ -950,10 +954,10 @@ public final class yacy {
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
- plasmaCrawlLURL.Entry entry;
+ plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
try {
- entry = (plasmaCrawlLURL.Entry) eiter.next();
+ entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
@@ -1061,9 +1065,9 @@ public final class yacy {
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
- plasmaCrawlLURL.Entry entry;
+ plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
- entry = (plasmaCrawlLURL.Entry) eiter.next();
+ entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
bos.write(("" + entry.descr() + "
").getBytes("UTF-8"));
@@ -1114,6 +1118,27 @@ public final class yacy {
}
}
+    /**
+     * Copies every loaded-URL entry of the URL pool stored under
+     * "DATA/PLASMADB" into a kelondroFlexSplitTable named "urls" that is
+     * opened under "DATA//INDEX/PUBLIC/TEXT" (path kept exactly as written).
+     * Entries that are null or carry no URL are skipped. An IOException ends
+     * the migration and its stack trace is printed.
+     *
+     * @param homePath application root directory that contains the DATA folder
+     */
+    private static void migratelurls(String homePath) {
+        File root = new File(homePath);
+        try {
+            plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000);
+            try {
+                // NOTE(review): fsp is never closed explicitly - confirm whether
+                // kelondroFlexSplitTable needs a close/flush call after the copy
+                kelondroFlexSplitTable fsp = new kelondroFlexSplitTable(new File(root, "DATA//INDEX/PUBLIC/TEXT"), "urls", 1000, -1, plasmaCrawlLURLOldEntry.rowdef, kelondroNaturalOrder.naturalOrder);
+
+                Iterator eiter = pool.loadedURL.entries(true, false, null);
+                plasmaCrawlLURLEntry entry;
+                while (eiter.hasNext()) {
+                    entry = (plasmaCrawlLURLEntry) eiter.next();
+                    if ((entry != null) && (entry.url() != null)) {
+                        // the entry's load date is handed to the table together with the row
+                        fsp.put(entry.toRowEntry(), entry.loaddate());
+                    }
+                }
+            } finally {
+                // close the pool even when the copy loop fails part-way
+                // (IOException from put/toRowEntry, RuntimeException from the iterator)
+                pool.close();
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
private static String[] shift(String[] args, int pos, int count) {
String[] newargs = new String[args.length - count];
System.arraycopy(args, 0, newargs, 0, pos);
@@ -1365,6 +1390,8 @@ public final class yacy {
if (args.length == 2) applicationRoot= args[1];
String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, source, html, outfile);
+ } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) {
+ migratelurls(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];