From 109ed0a0bb23982897b58117ed65dc579c616f11 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Tue, 5 Dec 2006 02:47:51 +0000
Subject: [PATCH] - Cleaned up code; removed the methods that write the old
data structures. - Added an assortment importer; the old database structures
can now be imported with: java -classpath classes yacy -migrateassortments -
Modified the word migration: the indexes from WORDS are now imported into the
collection database. The call is still: java -classpath classes yacy
-migratewords (unchanged)
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3044 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
doc/Download.html | 10 +-
htroot/Bookmarks.java | 2 +-
htroot/IndexCleaner_p.java | 6 +-
htroot/IndexControl_p.java | 27 +-
htroot/IndexCreateIndexingQueue_p.java | 14 +-
htroot/IndexCreateWWWGlobalQueue_p.java | 8 +-
htroot/IndexCreateWWWLocalQueue_p.java | 16 +-
htroot/IndexCreate_p.java | 16 +-
htroot/IndexImport_p.java | 4 +-
htroot/IndexMonitor.java | 22 +-
htroot/IndexShare_p.java | 4 +-
htroot/IndexTransfer_p.java | 2 +-
htroot/PerformanceMemory_p.java | 34 +-
htroot/PerformanceQueues_p.java | 2 +-
htroot/QuickCrawlLink_p.java | 6 +-
htroot/ViewFile.java | 2 +-
htroot/htdocsdefault/dir.java | 8 +-
htroot/xml/queues_p.java | 8 +-
htroot/yacy/crawlOrder.java | 4 +-
htroot/yacy/crawlReceipt.java | 18 +-
htroot/yacy/query.java | 2 +-
htroot/yacy/search.java | 4 +-
htroot/yacy/transferRWI.java | 70 +--
htroot/yacy/transferURL.java | 10 +-
htroot/yacysearch.java | 2 +-
source/de/anomic/index/indexCachedRI.java | 21 +-
source/de/anomic/index/indexCollectionRI.java | 20 +-
source/de/anomic/index/indexContainer.java | 40 +-
source/de/anomic/index/indexRAMRI.java | 45 +-
source/de/anomic/index/indexRI.java | 6 +-
source/de/anomic/index/indexRWIEntryNew.java | 3 +-
.../plasma/crawler/AbstractCrawlWorker.java | 4 +-
.../plasma/crawler/http/CrawlWorker.java | 2 +-
.../plasma/dbImport/AbstractImporter.java | 19 +-
.../plasma/dbImport/AssortmentImporter.java | 58 ++-
.../plasma/dbImport/dbImportManager.java | 6 +-
.../de/anomic/plasma/dbImport/dbImporter.java | 2 +-
.../dbImport/plasmaCrawlNURLImporter.java | 13 +-
.../plasma/dbImport/plasmaDbImporter.java | 52 +--
source/de/anomic/plasma/plasmaCrawlEURL.java | 7 +-
source/de/anomic/plasma/plasmaCrawlLURL.java | 6 +-
.../de/anomic/plasma/plasmaCrawlStacker.java | 12 +-
source/de/anomic/plasma/plasmaDHTChunk.java | 8 +-
source/de/anomic/plasma/plasmaDHTFlush.java | 2 +-
source/de/anomic/plasma/plasmaHTCache.java | 2 +-
.../de/anomic/plasma/plasmaSwitchboard.java | 166 +++----
source/de/anomic/plasma/plasmaURLPool.java | 99 -----
source/de/anomic/plasma/plasmaWordIndex.java | 194 +++------
.../plasma/plasmaWordIndexAssortment.java | 149 +------
.../plasmaWordIndexAssortmentCluster.java | 408 ------------------
.../de/anomic/plasma/plasmaWordIndexFile.java | 207 +--------
.../plasma/plasmaWordIndexFileCluster.java | 94 +---
.../anomic/urlRedirector/urlRedirectord.java | 6 +-
source/de/anomic/yacy/yacyClient.java | 29 +-
source/de/anomic/yacy/yacyDHTAction.java | 1 +
source/de/anomic/yacy/yacyPeerActions.java | 4 +-
source/yacy.java | 172 +++++---
57 files changed, 601 insertions(+), 1557 deletions(-)
delete mode 100644 source/de/anomic/plasma/plasmaURLPool.java
delete mode 100644 source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
diff --git a/doc/Download.html b/doc/Download.html
index 7e0a228b7..b4aea23c8 100644
--- a/doc/Download.html
+++ b/doc/Download.html
@@ -53,19 +53,19 @@ globalheader();
If you download the software, you must accept the License.
Latest Release:
-The latest YaCy release version is 0.48
+The latest YaCy release version is 0.49
Nightly builds from compiles out of SVN can be obtained from http://latest.yacy-forum.net/.
- Generic release of YaCy (all platforms with J2SE 1.4.2: Linux, Mac OS X, Windows, Solaris):
- Windows-flavour release of YaCy (same code as generic release, but with convenient Windows-Installer):
diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index 42a36bc6d..28f5fef1d 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -147,7 +147,7 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
- indexURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
+ indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null);
plasmaParserDocument document = null;
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
diff --git a/htroot/IndexCleaner_p.java b/htroot/IndexCleaner_p.java
index fc36ed171..6195bd13e 100755
--- a/htroot/IndexCleaner_p.java
+++ b/htroot/IndexCleaner_p.java
@@ -62,7 +62,7 @@ public class IndexCleaner_p {
prop.put("bla", "post!=null");
if (post.get("action").equals("ustart")) {
if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) {
- urldbCleanerThread = sb.urlPool.loadedURL.makeCleaner();
+ urldbCleanerThread = sb.wordIndex.loadedURL.makeCleaner();
urldbCleanerThread.start();
}
else {
@@ -77,7 +77,7 @@ public class IndexCleaner_p {
}
else if (post.get("action").equals("rstart")) {
if (indexCleanerThread==null || !indexCleanerThread.isAlive()) {
- indexCleanerThread = sb.wordIndex.makeCleaner(sb.urlPool.loadedURL, post.get("wordHash","--------"));
+ indexCleanerThread = sb.wordIndex.makeCleaner(sb.wordIndex.loadedURL, post.get("wordHash","--------"));
indexCleanerThread.start();
}
else {
@@ -98,7 +98,7 @@ public class IndexCleaner_p {
}
if (urldbCleanerThread!=null) {
prop.put("urldb", 1);
- prop.put("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/sb.urlPool.loadedURL.size())*100 + "");
+ prop.put("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/sb.wordIndex.loadedURL.size())*100 + "");
prop.put("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
prop.put("urldb_total", urldbCleanerThread.totalSearchedUrls);
prop.put("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);
diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index 146c6bfdd..62758aafb 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -63,7 +63,6 @@ import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
@@ -87,7 +86,7 @@ public class IndexControl_p {
prop.put("urlhash", "");
prop.put("result", "");
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
- prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
+ prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
prop.put("otherHosts", "");
prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
prop.put("indexDistributeWhileCrawling", (switchboard.getConfig("allowDistributeIndexWhileCrawling", "true").equals("true")) ? "checked" : "");
@@ -170,7 +169,7 @@ public class IndexControl_p {
}
if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) {
- switchboard.urlPool.loadedURL.remove(urlx[i]);
+ switchboard.wordIndex.loadedURL.remove(urlx[i]);
}
}
switchboard.wordIndex.deleteContainer(keyhash);
@@ -190,7 +189,7 @@ public class IndexControl_p {
}
if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) {
- switchboard.urlPool.loadedURL.remove(urlx[i]);
+ switchboard.wordIndex.loadedURL.remove(urlx[i]);
}
}
Set urlHashes = new HashSet();
@@ -217,13 +216,13 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
- indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
+ indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
urlstring = entry.comp().url().toNormalform();
prop.put("urlstring", "");
- switchboard.urlPool.loadedURL.remove(urlhash);
+ switchboard.wordIndex.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring);
}
}
@@ -282,7 +281,7 @@ public class IndexControl_p {
indexURLEntry lurl;
while (urlIter.hasNext()) {
iEntry = (indexRWIEntry) urlIter.next();
- lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
+ lurl = switchboard.wordIndex.loadedURL.load(iEntry.urlHash(), null);
if (lurl == null) {
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
@@ -307,7 +306,7 @@ public class IndexControl_p {
// generate list
if (post.containsKey("keyhashsimilar")) {
- final Iterator containerIt = switchboard.wordIndex.indexContainerSet(keyhash, plasmaWordIndex.RL_WORDFILES, true, 256).iterator();
+ final Iterator containerIt = switchboard.wordIndex.indexContainerSet(keyhash, false, true, 256).iterator();
indexContainer container;
int i = 0;
int rows = 0, cols = 0;
@@ -333,7 +332,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = plasmaURL.urlHash(url);
prop.put("urlhash", urlhash);
- indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
+ indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
@@ -347,7 +346,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashsearch")) {
- indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
+ indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
@@ -359,7 +358,7 @@ public class IndexControl_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
- final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
+ final Iterator entryIt = switchboard.wordIndex.loadedURL.entries(true, true, urlhash);
StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:
");
indexURLEntry entry;
int i = 0;
@@ -403,7 +402,7 @@ public class IndexControl_p {
// insert constants
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
- prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
+ prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
prop.put("indexDistributeWhileCrawling", (switchboard.getConfig("allowDistributeIndexWhileCrawling", "true").equals("true")) ? "checked" : "");
prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : "");
@@ -422,7 +421,7 @@ public class IndexControl_p {
}
indexURLEntry.Components comp = entry.comp();
String referrer = null;
- indexURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
+ indexURLEntry le = switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "";
} else {
@@ -471,7 +470,7 @@ public class IndexControl_p {
while (en.hasNext()) {
xi = (indexRWIEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
- indexURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
+ indexURLEntry le = switchboard.wordIndex.loadedURL.load(uh[0], null);
if (le == null) {
tm.put(uh[0], uh);
} else {
diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java
index 32e66590e..35d1eabbc 100644
--- a/htroot/IndexCreateIndexingQueue_p.java
+++ b/htroot/IndexCreateIndexingQueue_p.java
@@ -76,7 +76,7 @@ public class IndexCreateIndexingQueue_p {
}
if (post.containsKey("clearRejected")) {
- switchboard.urlPool.errorURL.clearStack();
+ switchboard.errorURL.clearStack();
}
if (post.containsKey("moreRejected")) {
showRejectedCount = Integer.parseInt(post.get("showRejected", "10"));
@@ -172,11 +172,11 @@ public class IndexCreateIndexingQueue_p {
}
// failure cases
- if (switchboard.urlPool.errorURL.stackSize() != 0) {
- if (showRejectedCount > switchboard.urlPool.errorURL.stackSize()) showRejectedCount = switchboard.urlPool.errorURL.stackSize();
+ if (switchboard.errorURL.stackSize() != 0) {
+ if (showRejectedCount > switchboard.errorURL.stackSize()) showRejectedCount = switchboard.errorURL.stackSize();
prop.put("rejected", 1);
- prop.put("rejected_num", switchboard.urlPool.errorURL.stackSize());
- if (showRejectedCount != switchboard.urlPool.errorURL.stackSize()) {
+ prop.put("rejected_num", switchboard.errorURL.stackSize());
+ if (showRejectedCount != switchboard.errorURL.stackSize()) {
prop.put("rejected_only-latest", 1);
prop.put("rejected_only-latest_num", showRejectedCount);
prop.put("rejected_only-latest_newnum", ((int) (showRejectedCount * 1.5)));
@@ -189,9 +189,9 @@ public class IndexCreateIndexingQueue_p {
plasmaCrawlEURL.Entry entry;
yacySeed initiatorSeed, executorSeed;
int j=0;
- for (int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
+ for (int i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) {
try {
- entry = switchboard.urlPool.errorURL.stackPopEntry(i);
+ entry = switchboard.errorURL.stackPopEntry(i);
url = entry.url();
if (url == null) continue;
diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java
index 10cea766f..3c8767480 100644
--- a/htroot/IndexCreateWWWGlobalQueue_p.java
+++ b/htroot/IndexCreateWWWGlobalQueue_p.java
@@ -79,8 +79,8 @@ public class IndexCreateWWWGlobalQueue_p {
}
if (post.containsKey("clearcrawlqueue")) {
- int c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
- switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+ int c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+ switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT);
try { switchboard.cleanProfiles(); } catch (InterruptedException e) { /* Ignore this */}
/*
int c = 0;
@@ -94,12 +94,12 @@ public class IndexCreateWWWGlobalQueue_p {
}
}
- int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+ int stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (stackSize == 0) {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
- plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
+ plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
prop.put("crawler-queue_num", stackSize);//num Entries
plasmaCrawlNURL.Entry urle;
boolean dark = true;
diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java
index 137ce33c4..5d5042949 100644
--- a/htroot/IndexCreateWWWLocalQueue_p.java
+++ b/htroot/IndexCreateWWWLocalQueue_p.java
@@ -90,8 +90,8 @@ public class IndexCreateWWWLocalQueue_p {
String pattern = post.get("pattern", ".*").trim();
String option = post.get("option", ".*").trim();
if (pattern.equals(".*")) {
- c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
- switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
+ c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
+ switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
try { switchboard.cleanProfiles(); } catch (InterruptedException e) {/* ignore this */}
} else{
Pattern compiledPattern = null;
@@ -100,13 +100,13 @@ public class IndexCreateWWWLocalQueue_p {
compiledPattern = Pattern.compile(pattern);
// iterating through the list of URLs
- Iterator iter = switchboard.urlPool.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
+ Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
while (iter.hasNext()) {
String value = null;
String nextHash = new String((byte[]) iter.next());
Entry entry = null;
try {
- entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
+ entry = switchboard.noticeURL.getEntry(nextHash);
} catch (IOException e) {
continue;
}
@@ -137,7 +137,7 @@ public class IndexCreateWWWLocalQueue_p {
if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
- switchboard.urlPool.noticeURL.remove(nextHash);
+ switchboard.noticeURL.remove(nextHash);
}
}
@@ -151,18 +151,18 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
String urlHash = (String) post.get("deleteEntry");
- switchboard.urlPool.noticeURL.remove(urlHash);
+ switchboard.noticeURL.remove(urlHash);
prop.put("LOCATION","");
return prop;
}
}
- int showNum = 0, stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
+ int showNum = 0, stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
if (stackSize == 0) {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
- plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
+ plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
plasmaCrawlNURL.Entry urle;
boolean dark = true;
diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index b737083a2..ee5bbda4f 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -168,9 +168,9 @@ public class IndexCreate_p {
// stack request
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(crawlingStart);
- switchboard.urlPool.loadedURL.remove(urlhash);
- switchboard.urlPool.noticeURL.remove(urlhash);
- switchboard.urlPool.errorURL.remove(urlhash);
+ switchboard.wordIndex.loadedURL.remove(urlhash);
+ switchboard.noticeURL.remove(urlhash);
+ switchboard.errorURL.remove(urlhash);
// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
@@ -203,10 +203,10 @@ public class IndexCreate_p {
prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
prop.put("error_reasonString", reasonString);
- plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
+ plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new kelondroBitfield());
ee.store();
- switchboard.urlPool.errorURL.stackPushEntry(ee);
+ switchboard.errorURL.stackPushEntry(ee);
}
} catch (PatternSyntaxException e) {
prop.put("error", 8); //crawlfilter does not match url
@@ -281,10 +281,10 @@ public class IndexCreate_p {
if (rejectReason == null) {
c++;
} else {
- plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
+ plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new kelondroBitfield());
ee.store();
- switchboard.urlPool.errorURL.stackPushEntry(ee);
+ switchboard.errorURL.stackPushEntry(ee);
}
}
@@ -412,7 +412,7 @@ public class IndexCreate_p {
int queueStackSize = switchboard.sbQueue.size();
int loaderThreadsSize = switchboard.cacheLoader.size();
- int crawlerListSize = switchboard.urlPool.noticeURL.stackSize();
+ int crawlerListSize = switchboard.noticeURL.stackSize();
int completequeue = queueStackSize + loaderThreadsSize + crawlerListSize;
if ((completequeue > 0) || ((post != null) && (post.containsKey("refreshpage")))) {
diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java
index 25bb87950..1045675ce 100644
--- a/htroot/IndexImport_p.java
+++ b/htroot/IndexImport_p.java
@@ -98,7 +98,7 @@ public final class IndexImport_p {
if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
if (importerThread != null) {
- importerThread.init(new File(importPath), switchboard.indexPath, cacheSize, 100);
+ importerThread.init(new File(importPath), cacheSize, 100);
importerThread.startIt();
}
prop.put("LOCATION","");
@@ -147,7 +147,7 @@ public final class IndexImport_p {
}
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
- prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
+ prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
/*
* Loop over all currently running jobs
diff --git a/htroot/IndexMonitor.java b/htroot/IndexMonitor.java
index 97568020e..24dd5c6d1 100644
--- a/htroot/IndexMonitor.java
+++ b/htroot/IndexMonitor.java
@@ -109,12 +109,12 @@ public class IndexMonitor {
}
// do the commands
- if (post.containsKey("clearlist")) sb.urlPool.loadedURL.clearStack(tabletype);
+ if (post.containsKey("clearlist")) sb.wordIndex.loadedURL.clearStack(tabletype);
if (post.containsKey("deleteentry")) {
String hash = post.get("hash", null);
if (hash != null) {
// delete from database
- sb.urlPool.loadedURL.remove(hash);
+ sb.wordIndex.loadedURL.remove(hash);
}
}
if (post.containsKey("moreIndexed")) {
@@ -126,18 +126,18 @@ public class IndexMonitor {
// create table
if (tabletype == 0) {
prop.put("table", 2);
- } else if (sb.urlPool.loadedURL.getStackSize(tabletype) == 0) {
+ } else if (sb.wordIndex.loadedURL.getStackSize(tabletype) == 0) {
prop.put("table", 0);
} else {
prop.put("table", 1);
- if (lines > sb.urlPool.loadedURL.getStackSize(tabletype)) lines = sb.urlPool.loadedURL.getStackSize(tabletype);
- if (lines == sb.urlPool.loadedURL.getStackSize(tabletype)) {
+ if (lines > sb.wordIndex.loadedURL.getStackSize(tabletype)) lines = sb.wordIndex.loadedURL.getStackSize(tabletype);
+ if (lines == sb.wordIndex.loadedURL.getStackSize(tabletype)) {
prop.put("table_size", 0);
} else {
prop.put("table_size", 1);
prop.put("table_size_count", lines);
}
- prop.put("table_size_all", sb.urlPool.loadedURL.getStackSize(tabletype));
+ prop.put("table_size_all", sb.wordIndex.loadedURL.getStackSize(tabletype));
prop.put("table_feedbackpage", "IndexMonitor.html");
prop.put("table_tabletype", tabletype);
prop.put("table_showInit", (showInit) ? 1 : 0);
@@ -153,14 +153,14 @@ public class IndexMonitor {
final plasmaHTCache cacheManager = sb.getCacheManager();
int i, cnt = 0;
- for (i = sb.urlPool.loadedURL.getStackSize(tabletype) - 1; i >= (sb.urlPool.loadedURL.getStackSize(tabletype) - lines); i--) {
- initiatorHash = sb.urlPool.loadedURL.getInitiatorHash(tabletype, i);
- executorHash = sb.urlPool.loadedURL.getExecutorHash(tabletype, i);
+ for (i = sb.wordIndex.loadedURL.getStackSize(tabletype) - 1; i >= (sb.wordIndex.loadedURL.getStackSize(tabletype) - lines); i--) {
+ initiatorHash = sb.wordIndex.loadedURL.getInitiatorHash(tabletype, i);
+ executorHash = sb.wordIndex.loadedURL.getExecutorHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
- urlHash = sb.urlPool.loadedURL.getUrlHash(tabletype, i);
+ urlHash = sb.wordIndex.loadedURL.getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
- urle = sb.urlPool.loadedURL.load(urlHash, null);
+ urle = sb.wordIndex.loadedURL.load(urlHash, null);
indexURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java
index b6f362e42..a174beb45 100644
--- a/htroot/IndexShare_p.java
+++ b/htroot/IndexShare_p.java
@@ -66,7 +66,7 @@ public class IndexShare_p {
prop.put("dtable", "");
prop.put("rtable", "");
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
- prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
+ prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
return prop; // be save
}
@@ -79,7 +79,7 @@ public class IndexShare_p {
// insert constants
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
- prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
+ prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
// return rewrite properties
return prop;
}
diff --git a/htroot/IndexTransfer_p.java b/htroot/IndexTransfer_p.java
index 1c71c091e..f6ac9952a 100644
--- a/htroot/IndexTransfer_p.java
+++ b/htroot/IndexTransfer_p.java
@@ -96,7 +96,7 @@ public final class IndexTransfer_p {
// insert constants
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
- prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
+ prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
prop.put("running",(switchboard.transferIdxThread==null)?0:1);
if (switchboard.transferIdxThread != null) {
String[] status = switchboard.transferIdxThread.getStatus();
diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java
index 591a67e3a..ffafb1ab2 100644
--- a/htroot/PerformanceMemory_p.java
+++ b/htroot/PerformanceMemory_p.java
@@ -175,11 +175,11 @@ public class PerformanceMemory_p {
ost = sb.cacheManager.cacheObjectStatus();
putprop(prop, env, "", "HTTP", set);
- req = sb.urlPool.loadedURL.size();
- chk = sb.urlPool.loadedURL.cacheNodeChunkSize();
- obj = sb.urlPool.loadedURL.cacheObjectChunkSize();
- slt = sb.urlPool.loadedURL.cacheNodeStatus();
- ost = sb.urlPool.loadedURL.cacheObjectStatus();
+ req = sb.wordIndex.loadedURL.size();
+ chk = sb.wordIndex.loadedURL.cacheNodeChunkSize();
+ obj = sb.wordIndex.loadedURL.cacheObjectChunkSize();
+ slt = sb.wordIndex.loadedURL.cacheNodeStatus();
+ ost = sb.wordIndex.loadedURL.cacheObjectStatus();
putprop(prop, env, "", "LURL", set);
if (sb.sbStackCrawlThread.getDBType() != de.anomic.plasma.plasmaCrawlStacker.QUEUE_DB_TYPE_TREE) {
@@ -194,27 +194,27 @@ public class PerformanceMemory_p {
putprop(prop, env, "usePreNURLCache", "PreNURL", set);
}
- if (sb.urlPool.noticeURL.getUseNewDB()) {
+ if (sb.noticeURL.getUseNewDB()) {
prop.put("useNURLCache", 0);
} else {
prop.put("useNURLCache", 1);
- req = sb.urlPool.noticeURL.size();
- chk = sb.urlPool.noticeURL.cacheNodeChunkSize();
- obj = sb.urlPool.noticeURL.cacheObjectChunkSize();
- slt = sb.urlPool.noticeURL.cacheNodeStatus();
- ost = sb.urlPool.noticeURL.cacheObjectStatus();
+ req = sb.noticeURL.size();
+ chk = sb.noticeURL.cacheNodeChunkSize();
+ obj = sb.noticeURL.cacheObjectChunkSize();
+ slt = sb.noticeURL.cacheNodeStatus();
+ ost = sb.noticeURL.cacheObjectStatus();
putprop(prop, env, "useNURLCache", "NURL", set);
}
- if (sb.urlPool.errorURL.getUseNewDB()) {
+ if (sb.errorURL.getUseNewDB()) {
prop.put("useEURLCache", 0);
} else {
prop.put("useEURLCache", 1);
- req = sb.urlPool.errorURL.size();
- chk = sb.urlPool.errorURL.cacheNodeChunkSize();
- obj = sb.urlPool.errorURL.cacheObjectChunkSize();
- slt = sb.urlPool.errorURL.cacheNodeStatus();
- ost = sb.urlPool.errorURL.cacheObjectStatus();
+ req = sb.errorURL.size();
+ chk = sb.errorURL.cacheNodeChunkSize();
+ obj = sb.errorURL.cacheObjectChunkSize();
+ slt = sb.errorURL.cacheNodeStatus();
+ ost = sb.errorURL.cacheObjectStatus();
putprop(prop, env, "useEURLCache", "EURL", set);
}
diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java
index 6f3cafc30..1df1e0e27 100644
--- a/htroot/PerformanceQueues_p.java
+++ b/htroot/PerformanceQueues_p.java
@@ -262,7 +262,7 @@ public class PerformanceQueues_p {
}
// table cache settings
- prop.put("urlCacheSize", switchboard.urlPool.loadedURL.writeCacheSize());
+ prop.put("urlCacheSize", switchboard.wordIndex.loadedURL.writeCacheSize());
prop.put("wordCacheWSize", switchboard.wordIndex.dhtOutCacheSize());
prop.put("wordCacheKSize", switchboard.wordIndex.dhtInCacheSize());
prop.put("maxURLinWCache", "" + switchboard.wordIndex.maxURLinDHTOutCache());
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 27d205924..dff5eef4d 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -149,9 +149,9 @@ public class QuickCrawlLink_p {
}
String urlhash = plasmaURL.urlHash(crawlingStart);
- switchboard.urlPool.loadedURL.remove(urlhash);
- switchboard.urlPool.noticeURL.remove(urlhash);
- switchboard.urlPool.errorURL.remove(urlhash);
+ switchboard.wordIndex.loadedURL.remove(urlhash);
+ switchboard.noticeURL.remove(urlhash);
+ switchboard.errorURL.remove(urlhash);
// create crawling profile
plasmaCrawlProfile.entry pe = null;
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 8af387d82..ad20fdc50 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -108,7 +108,7 @@ public class ViewFile {
if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null;
- urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
+ urlEntry = sb.wordIndex.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java
index 3d94c6de4..5b6bfdab1 100644
--- a/htroot/htdocsdefault/dir.java
+++ b/htroot/htdocsdefault/dir.java
@@ -361,7 +361,7 @@ public class dir {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()), "UTF-8");
- final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
+ final indexURLEntry newEntry = switchboard.wordIndex.loadedURL.newEntry(
url,
"YaCyShare: " + descr,
yacyCore.seedDB.mySeed.getName(),
@@ -379,8 +379,8 @@ public class dir {
"**", // language
0,0,0,0,0,0
);
- switchboard.urlPool.loadedURL.store(newEntry);
- switchboard.urlPool.loadedURL.stack(
+ switchboard.wordIndex.loadedURL.store(newEntry);
+ switchboard.wordIndex.loadedURL.stack(
newEntry,
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/
@@ -401,7 +401,7 @@ public class dir {
entry = (Map.Entry) words.next();
switchboard.wordIndex.removeEntry(plasmaCondenser.word2hash((String) entry.getKey()), urlhash, true);
}
- switchboard.urlPool.loadedURL.remove(urlhash);
+ switchboard.wordIndex.loadedURL.remove(urlhash);
} catch (Exception e) {
serverLog.logSevere("DIR", "INTERNAL ERROR in dir.deletePhrase", e);
}
diff --git a/htroot/xml/queues_p.java b/htroot/xml/queues_p.java
index dadd71ab2..1ab477abb 100644
--- a/htroot/xml/queues_p.java
+++ b/htroot/xml/queues_p.java
@@ -164,17 +164,17 @@ public class queues_p {
//local crawl queue
prop.put("localCrawlSize", Integer.toString(switchboard.getThread("50_localcrawl").getJobCount()));
- int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
- addNTable(prop, "list-local", switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize)));
+ int stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
+ addNTable(prop, "list-local", switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize)));
//global crawl queue
prop.put("remoteCrawlSize", Integer.toString(switchboard.getThread("61_globalcrawltrigger").getJobCount()));
//prop.put("remoteCrawlSize", Integer.toString(switchboard.getThread("62_remotetriggeredcrawl").getJobCount()));
- stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+ stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (stackSize == 0) {
prop.put("list-remote", 0);
} else {
- addNTable(prop, "list-remote", switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, Math.min(10, stackSize)));
+ addNTable(prop, "list-remote", switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, Math.min(10, stackSize)));
}
// return rewrite properties
diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java
index c9add44ae..ae6a17bfa 100644
--- a/htroot/yacy/crawlOrder.java
+++ b/htroot/yacy/crawlOrder.java
@@ -249,13 +249,13 @@ public final class crawlOrder {
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
- indexURLEntry entry = switchboard.urlPool.loadedURL.load(plasmaURL.urlHash(url), null);
+ indexURLEntry entry = switchboard.wordIndex.loadedURL.load(plasmaURL.urlHash(url), null);
if (entry == null) {
response = "rejected";
lurl = "";
} else {
response = "double";
- switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
+ switchboard.wordIndex.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString());
}
} else {
diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java
index bc8cc87bc..c4e90e520 100644
--- a/htroot/yacy/crawlReceipt.java
+++ b/htroot/yacy/crawlReceipt.java
@@ -124,7 +124,7 @@ public final class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// generating a new loaded URL entry
- indexURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr);
+ indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr);
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
@@ -135,16 +135,16 @@ public final class crawlReceipt {
"\n\tURL properties: "+ propStr);
} else try {
// put new entry into database
- switchboard.urlPool.loadedURL.store(entry);
- switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1);
+ switchboard.wordIndex.loadedURL.store(entry);
+ switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1);
// generating url hash
String newUrlHash = plasmaURL.urlHash(comp.url());
String oldUrlHash = plasmaURL.oldurlHash(comp.url());
// removing URL from notice URL
- switchboard.urlPool.noticeURL.remove(newUrlHash);
- switchboard.urlPool.noticeURL.remove(oldUrlHash);
+ switchboard.noticeURL.remove(newUrlHash);
+ switchboard.noticeURL.remove(oldUrlHash);
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform());
} catch (IOException e) {
@@ -155,11 +155,11 @@ public final class crawlReceipt {
prop.put("delay", "10");
} else {
try {
- plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
- plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield());
+ plasmaCrawlNURL.Entry en = switchboard.noticeURL.getEntry(receivedUrlhash);
+ plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield());
ee.store();
- switchboard.urlPool.errorURL.stackPushEntry(ee);
- switchboard.urlPool.noticeURL.remove(receivedUrlhash);
+ switchboard.errorURL.stackPushEntry(ee);
+ switchboard.noticeURL.remove(receivedUrlhash);
} catch (IOException e) {
}
diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java
index 1c7560117..24c33da18 100644
--- a/htroot/yacy/query.java
+++ b/htroot/yacy/query.java
@@ -98,7 +98,7 @@ public final class query {
if (obj.equals("lurlcount")) {
// return the number of all available l-url's
- prop.put("response", sb.urlPool.loadedURL.size());
+ prop.put("response", sb.wordIndex.loadedURL.size());
return prop;
}
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 96f929166..96ca414c7 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -144,7 +144,7 @@ public final class search {
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
plasmaSearchTimingProfile remoteTiming = null;
- plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
+ plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.wordIndex.loadedURL, sb.snippetCache);
Map containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls));
if (containers != null) {
Iterator ci = containers.entrySet().iterator();
@@ -173,7 +173,7 @@ public final class search {
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery,
rankingProfile, localTiming, remoteTiming, true,
- yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL,
+ yacyCore.log, sb.wordIndex, sb.wordIndex.loadedURL,
sb.snippetCache);
Map containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls));
diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java
index 80c8be09a..f9152a133 100644
--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@@ -53,7 +53,6 @@ import java.util.List;
import de.anomic.http.httpHeader;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryNew;
-import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
@@ -93,6 +92,7 @@ public final class transferRWI {
StringBuffer unknownURLs = new StringBuffer();
int pause = 0;
+ /*
boolean shortCacheFlush = false;
if ((granted) && (sb.wordIndex.busyCacheFlush)) {
// wait a little bit, maybe we got into a short flush slot
@@ -101,9 +101,10 @@ public final class transferRWI {
shortCacheFlush = true;
break;
}
- try {Thread.sleep(100);} catch (InterruptedException e) {/* */}
+ try {Thread.sleep(100);} catch (InterruptedException e) {}
}
}
+ */
if (!granted) {
// we dont want to receive indexes
@@ -152,42 +153,45 @@ public final class transferRWI {
Iterator i = v.iterator();
while (i.hasNext()) {
serverCore.checkInterruption();
-
estring = (String) i.next();
+
+ // check if RWI entry is well-formed
p = estring.indexOf("{");
- if (p > 0) {
- wordHash = estring.substring(0, p);
- wordhashes[received] = wordHash;
- if (estring.indexOf("x=") > 0)
- iEntry = new indexRWIEntryNew(estring.substring(p));
- else
- iEntry = new indexRWIEntryOld(estring.substring(p));
- urlHash = iEntry.urlHash();
- if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) {
- int deleted = sb.wordIndex.tryRemoveURLs(urlHash);
- yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
- blocked++;
- } else {
- sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true);
- serverCore.checkInterruption();
+ if ((p < 0) || (estring.indexOf("x=") < 0)) {
+ blocked++;
+ continue;
+ }
+ wordHash = estring.substring(0, p);
+ wordhashes[received] = wordHash;
+ iEntry = new indexRWIEntryNew(estring.substring(p));
+ urlHash = iEntry.urlHash();
+
+ // block blacklisted entries
+ if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) {
+ int deleted = sb.wordIndex.tryRemoveURLs(urlHash);
+ yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
+ blocked++;
+ continue;
+ }
+
+ // learn entry
+ sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true);
+ serverCore.checkInterruption();
- if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) {
- try {
- if (sb.urlPool.loadedURL.exists(urlHash)) {
- knownURL.add(urlHash);
- } else {
- unknownURL.add(urlHash);
- }
- } catch (Exception ex) {
- sb.getLog().logWarning(
- "transferRWI: DB-Error while trying to determine if URL with hash '" +
- urlHash + "' is known.", ex);
- }
- receivedURL++;
- }
- received++;
+ // check if we need to ask for the corresponding URL
+ if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) try {
+ if (sb.wordIndex.loadedURL.exists(urlHash)) {
+ knownURL.add(urlHash);
+ } else {
+ unknownURL.add(urlHash);
}
+ receivedURL++;
+ } catch (Exception ex) {
+ sb.getLog().logWarning(
+ "transferRWI: DB-Error while trying to determine if URL with hash '" +
+ urlHash + "' is known.", ex);
}
+ received++;
}
yacyCore.seedDB.mySeed.incRI(received);
diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java
index 6984bf679..bfcdbce05 100644
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@@ -87,7 +87,7 @@ public final class transferURL {
if (granted) {
int received = 0;
int blocked = 0;
- final int sizeBefore = sb.urlPool.loadedURL.size();
+ final int sizeBefore = sb.wordIndex.loadedURL.size();
// read the urls from the other properties and store
String urls;
indexURLEntry lEntry;
@@ -97,7 +97,7 @@ public final class transferURL {
if (urls == null) {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
} else {
- lEntry = sb.urlPool.loadedURL.newEntry(urls);
+ lEntry = sb.wordIndex.loadedURL.newEntry(urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
@@ -113,8 +113,8 @@ public final class transferURL {
lEntry = null;
blocked++;
} else try {
- sb.urlPool.loadedURL.store(lEntry);
- sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
+ sb.wordIndex.loadedURL.store(lEntry);
+ sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
@@ -128,7 +128,7 @@ public final class transferURL {
yacyCore.seedDB.mySeed.incRU(received);
// return rewrite properties
- final int more = sb.urlPool.loadedURL.size() - sizeBefore;
+ final int more = sb.wordIndex.loadedURL.size() - sizeBefore;
doublevalues = Integer.toString(received - more);
sb.getLog().logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms, Blocked " + blocked + " URLs");
if ((received - more) > 0) sb.getLog().logSevere("Received " + doublevalues + " double URLs from peer " + otherPeerName);
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 308f4de05..33b30ff4a 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -201,7 +201,7 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
- indexURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
+ indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null);
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;
diff --git a/source/de/anomic/index/indexCachedRI.java b/source/de/anomic/index/indexCachedRI.java
index 6acb9b148..40bc468a3 100644
--- a/source/de/anomic/index/indexCachedRI.java
+++ b/source/de/anomic/index/indexCachedRI.java
@@ -86,7 +86,7 @@ public class indexCachedRI implements indexRI {
return entries.updated();
}
- public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) {
+ public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) {
// add the entry
if (intern) {
riIntern.addEntry(wordHash, entry, updateTime, true);
@@ -94,10 +94,9 @@ public class indexCachedRI implements indexRI {
riExtern.addEntry(wordHash, entry, updateTime, false);
flushControl();
}
- return null;
}
- public indexContainer addEntries(indexContainer entries, long updateTime, boolean intern) {
+ public void addEntries(indexContainer entries, long updateTime, boolean intern) {
// add the entry
if (intern) {
riIntern.addEntries(entries, updateTime, true);
@@ -105,7 +104,6 @@ public class indexCachedRI implements indexRI {
riExtern.addEntries(entries, updateTime, false);
flushControl();
}
- return null;
}
public void flushCacheSome(boolean busy) {
@@ -133,12 +131,7 @@ public class indexCachedRI implements indexRI {
// flush the wordHash
indexContainer c = ram.deleteContainer(wordHash);
- if (c != null) {
- indexContainer feedback = backend.addEntries(c, c.updated(), false);
- if (feedback != null) {
- throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString());
- }
- }
+ if (c != null) backend.addEntries(c, c.updated(), false);
// pause to next loop to give other processes a chance to use IO
//try {this.wait(8);} catch (InterruptedException e) {}
@@ -206,11 +199,11 @@ public class indexCachedRI implements indexRI {
return size;
}
- public void close(int waitingBoundSeconds) {
+ public void close() {
synchronized (this) {
- riIntern.close(waitingBoundSeconds);
- riExtern.close(waitingBoundSeconds);
- backend.close(-1);
+ riIntern.close();
+ riExtern.close();
+ backend.close();
}
}
diff --git a/source/de/anomic/index/indexCollectionRI.java b/source/de/anomic/index/indexCollectionRI.java
index ca0bdd3d7..116139a59 100644
--- a/source/de/anomic/index/indexCollectionRI.java
+++ b/source/de/anomic/index/indexCollectionRI.java
@@ -104,7 +104,7 @@ public class indexCollectionRI implements indexRI {
byte[] key = (byte[]) oo[0];
kelondroRowSet collection = (kelondroRowSet) oo[1];
if (collection == null) return null;
- return new indexContainer(new String(key), collection, true);
+ return new indexContainer(new String(key), collection);
}
public void remove() {
@@ -118,7 +118,7 @@ public class indexCollectionRI implements indexRI {
kelondroRowSet collection = collectionIndex.get(wordHash.getBytes(), deleteIfEmpty);
if (collection != null) collection.select(urlselection);
if ((collection == null) || (collection.size() == 0)) return null;
- return new indexContainer(wordHash, collection, true);
+ return new indexContainer(wordHash, collection);
} catch (IOException e) {
return null;
}
@@ -128,7 +128,7 @@ public class indexCollectionRI implements indexRI {
try {
kelondroRowSet collection = collectionIndex.delete(wordHash.getBytes());
if (collection == null) return null;
- return new indexContainer(wordHash, collection, true);
+ return new indexContainer(wordHash, collection);
} catch (IOException e) {
return null;
}
@@ -152,26 +152,24 @@ public class indexCollectionRI implements indexRI {
}
}
- public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
- indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow(), true);
+ public synchronized void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
+ indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow());
container.add(newEntry);
- return addEntries(container, updateTime, dhtCase);
+ addEntries(container, updateTime, dhtCase);
}
- public synchronized indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase) {
+ public synchronized void addEntries(indexContainer newEntries, long creationTime, boolean dhtCase) {
String wordHash = newEntries.getWordHash();
try {
collectionIndex.merge(wordHash.getBytes(), (kelondroRowCollection) newEntries);
- return null; // merge does allways 'eat' up all entries unlike the assortments; they may return an overflow container
} catch (kelondroOutOfLimitsException e) {
e.printStackTrace();
- return null;
} catch (IOException e) {
- return null;
+ e.printStackTrace();
}
}
- public synchronized void close(int waitingSeconds) {
+ public synchronized void close() {
try {
collectionIndex.close();
} catch (IOException e) {
diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java
index 68085ba4e..0717eb431 100644
--- a/source/de/anomic/index/indexContainer.java
+++ b/source/de/anomic/index/indexContainer.java
@@ -41,33 +41,24 @@ import de.anomic.kelondro.kelondroRowSet;
public class indexContainer extends kelondroRowSet {
private String wordHash;
- private boolean newRWI;
- public indexContainer(String wordHash, kelondroRow rowdef, int objectCount, byte[] cache, boolean newRWI) {
- super(rowdef, objectCount, cache, kelondroBase64Order.enhancedCoder, 0, 0);
- this.wordHash = wordHash;
- this.newRWI = newRWI;
- }
-
- public indexContainer(String wordHash, kelondroRow rowdef, boolean newRWI) {
- this(wordHash, rowdef, kelondroBase64Order.enhancedCoder, 0, newRWI);
+ public indexContainer(String wordHash, kelondroRow rowdef) {
+ this(wordHash, rowdef, kelondroBase64Order.enhancedCoder, 0);
}
- public indexContainer(String wordHash, kelondroRowSet collection, boolean newRWI) {
+ public indexContainer(String wordHash, kelondroRowSet collection) {
super(collection);
this.wordHash = wordHash;
- this.newRWI = newRWI;
}
- public indexContainer(String wordHash, kelondroRow rowdef, kelondroOrder ordering, int column, boolean newRWI) {
+ public indexContainer(String wordHash, kelondroRow rowdef, kelondroOrder ordering, int column) {
super(rowdef, ordering, column, 0);
this.wordHash = wordHash;
this.lastTimeWrote = 0;
- this.newRWI = newRWI;
}
public indexContainer topLevelClone() {
- indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.sortOrder, this.sortColumn, this.newRWI);
+ indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.sortOrder, this.sortColumn);
newContainer.add(this, -1);
return newContainer;
}
@@ -133,7 +124,7 @@ public class indexContainer extends kelondroRowSet {
if (entry instanceof indexRWIEntryNew)
oldEntry = new indexRWIEntryNew(oldEntryRow);
else
- oldEntry = new indexRWIEntryOld(oldEntryRow); // FIXME: see if cloning is necessary
+ oldEntry = new indexRWIEntryNew(new indexRWIEntryOld(oldEntryRow));
if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container
this.put(oldEntry.toKelondroEntry()); // put it back
return false;
@@ -146,19 +137,13 @@ public class indexContainer extends kelondroRowSet {
public indexRWIEntry get(String urlHash) {
kelondroRow.Entry entry = this.get(urlHash.getBytes());
if (entry == null) return null;
- if (this.newRWI)
- return new indexRWIEntryNew(entry);
- else
- return new indexRWIEntryOld(entry);
+ return new indexRWIEntryNew(entry);
}
public indexRWIEntry remove(String urlHash) {
kelondroRow.Entry entry = this.remove(urlHash.getBytes());
if (entry == null) return null;
- if (this.newRWI)
- return new indexRWIEntryNew(entry);
- else
- return new indexRWIEntryOld(entry);
+ return new indexRWIEntryNew(entry);
}
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
@@ -194,10 +179,7 @@ public class indexContainer extends kelondroRowSet {
public Object next() {
kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next();
if (rentry == null) return null;
- if (newRWI)
- return new indexRWIEntryNew(rentry);
- else
- return new indexRWIEntryOld(rentry);
+ return new indexRWIEntryNew(rentry);
}
public void remove() {
@@ -307,7 +289,7 @@ public class indexContainer extends kelondroRowSet {
assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString();
int keylength = small.rowdef.width(0);
assert (keylength == large.rowdef.width(0));
- indexContainer conj = new indexContainer(null, small.rowdef, small.newRWI); // start with empty search result
+ indexContainer conj = new indexContainer(null, small.rowdef); // start with empty search result
Iterator se = small.entries();
indexRWIEntry ie0, ie1;
long stamp = System.currentTimeMillis();
@@ -330,7 +312,7 @@ public class indexContainer extends kelondroRowSet {
assert i1.rowdef.equals(i2.rowdef) : "i1 = " + i1.rowdef.toString() + "; i2 = " + i2.rowdef.toString();
int keylength = i1.rowdef.width(0);
assert (keylength == i2.rowdef.width(0));
- indexContainer conj = new indexContainer(null, i1.rowdef, i1.newRWI); // start with empty search result
+ indexContainer conj = new indexContainer(null, i1.rowdef); // start with empty search result
if (!((i1.order().signature().equals(i2.order().signature())) &&
(i1.primarykey() == i2.primarykey()))) return conj; // ordering must be equal
Iterator e1 = i1.entries();
diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java
index a80a8d18a..8d20646dc 100644
--- a/source/de/anomic/index/indexRAMRI.java
+++ b/source/de/anomic/index/indexRAMRI.java
@@ -58,7 +58,6 @@ public final class indexRAMRI implements indexRI {
private String indexArrayFileName;
private kelondroRow payloadrow;
private kelondroRow bufferStructureBasis;
- private boolean newRWI;
// calculated constants
private static String maxKey;
@@ -67,7 +66,7 @@ public final class indexRAMRI implements indexRI {
//minKey = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-';
}
- public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceLimitInit, String dumpname, serverLog log, boolean newRWI) {
+ public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceLimitInit, String dumpname, serverLog log) {
// creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed
@@ -79,7 +78,6 @@ public final class indexRAMRI implements indexRI {
this.cacheMaxCount = 10000;
this.cacheReferenceLimit = wCacheReferenceLimitInit;
this.log = log;
- this.newRWI = newRWI;
this.indexArrayFileName = dumpname;
this.payloadrow = payloadrow;
this.bufferStructureBasis = new kelondroRow(
@@ -103,7 +101,7 @@ public final class indexRAMRI implements indexRI {
return entries.updated();
}
- private void dump(int waitingSeconds) throws IOException {
+ private void dump() throws IOException {
log.logConfig("creating dump for index cache '" + indexArrayFileName + "', " + cache.size() + " words (and much more urls)");
File indexDumpFile = new File(databaseRoot, indexArrayFileName);
if (indexDumpFile.exists()) indexDumpFile.delete();
@@ -180,10 +178,7 @@ public final class indexRAMRI implements indexRI {
if ((row == null) || (row.empty(0)) || (row.empty(3))) continue;
wordHash = row.getColString(0, "UTF-8");
//creationTime = kelondroRecords.bytes2long(row[2]);
- if (newRWI)
- wordEntry = new indexRWIEntryNew(row.getColBytes(3));
- else
- wordEntry = new indexRWIEntryOld(row.getColBytes(3));
+ wordEntry = new indexRWIEntryNew(row.getColBytes(3));
// store to cache
addEntry(wordHash, wordEntry, startTime, false);
urlCount++;
@@ -423,10 +418,10 @@ public final class indexRAMRI implements indexRI {
return delCount;
}
- public synchronized indexContainer addEntries(indexContainer container, long updateTime, boolean dhtCase) {
+ public synchronized void addEntries(indexContainer container, long updateTime, boolean dhtCase) {
// this puts the entries into the cache, not into the assortment directly
int added = 0;
- if ((container == null) || (container.size() == 0)) return null;
+ if ((container == null) || (container.size() == 0)) return;
// put new words into cache
String wordHash = container.getWordHash();
@@ -443,28 +438,26 @@ public final class indexRAMRI implements indexRI {
hashDate.setScore(wordHash, intTime(updateTime));
}
entries = null;
- return null;
}
- public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
- indexContainer container = (indexContainer) cache.get(wordHash);
- if (container == null) container = new indexContainer(wordHash, this.payloadrow, true);
- indexRWIEntry[] entries = new indexRWIEntry[] { newEntry };
- if (container.add(entries, updateTime) > 0) {
- cache.put(wordHash, container);
- hashScore.incScore(wordHash);
- hashDate.setScore(wordHash, intTime(updateTime));
- return null;
- }
- container = null;
- entries = null;
- return null;
+ public synchronized void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
+ indexContainer container = (indexContainer) cache.get(wordHash);
+ if (container == null) container = new indexContainer(wordHash, this.payloadrow);
+ indexRWIEntry[] entries = new indexRWIEntry[] { newEntry };
+ if (container.add(entries, updateTime) > 0) {
+ cache.put(wordHash, container);
+ hashScore.incScore(wordHash);
+ hashDate.setScore(wordHash, intTime(updateTime));
+ return;
+ }
+ container = null;
+ entries = null;
}
- public synchronized void close(int waitingSeconds) {
+ public synchronized void close() {
// dump cache
try {
- dump(waitingSeconds);
+ dump();
} catch (IOException e){
log.logSevere("unable to dump cache: " + e.getMessage(), e);
}
diff --git a/source/de/anomic/index/indexRI.java b/source/de/anomic/index/indexRI.java
index 9618e0303..4313dbe9f 100644
--- a/source/de/anomic/index/indexRI.java
+++ b/source/de/anomic/index/indexRI.java
@@ -44,9 +44,9 @@ public interface indexRI {
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete);
public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete);
- public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase);
- public indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase);
+ public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase);
+ public void addEntries(indexContainer newEntries, long creationTime, boolean dhtCase);
- public void close(int waitingSeconds);
+ public void close();
}
diff --git a/source/de/anomic/index/indexRWIEntryNew.java b/source/de/anomic/index/indexRWIEntryNew.java
index dc2efb9b0..f064b1844 100644
--- a/source/de/anomic/index/indexRWIEntryNew.java
+++ b/source/de/anomic/index/indexRWIEntryNew.java
@@ -152,10 +152,9 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
assert oldEntry.urlHash() != null;
this.entry = urlEntryRow.newEntry();
int mddlm = plasmaWordIndex.microDateDays(oldEntry.lastModified());
- int mddct = plasmaWordIndex.microDateDays(System.currentTimeMillis());
this.entry.setCol(col_urlhash, oldEntry.urlHash(), null);
this.entry.setCol(col_lastModified, mddlm);
- this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
+ this.entry.setCol(col_freshUntil, 0);
this.entry.setCol(col_wordsInTitle, 20); // guessed
this.entry.setCol(col_wordsInText, oldEntry.wordcount());
this.entry.setCol(col_phrasesInText, oldEntry.phrasecount());
diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
index 00c0fa711..4e302ef94 100644
--- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
+++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
@@ -290,7 +290,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
String referrerHash = (this.refererURLString==null)?null:plasmaURL.urlHash(this.refererURLString);
// create a new errorURL DB entry
- plasmaCrawlEURL.Entry ee = this.sb.urlPool.errorURL.newEntry(
+ plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry(
this.url,
referrerHash,
this.initiator,
@@ -304,7 +304,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
ee.store();
// push it onto the stack
- this.sb.urlPool.errorURL.stackPushEntry(ee);
+ this.sb.errorURL.stackPushEntry(ee);
// delete the cache file
File cacheFile = this.cacheManager.getCachePath(this.url);
diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java
index 1429f8e44..cfe4e43ae 100644
--- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java
+++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java
@@ -342,7 +342,7 @@ public final class CrawlWorker extends AbstractCrawlWorker {
String urlhash = plasmaURL.urlHash(redirectionUrl);
// removing url from loader queue
- plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash);
+ plasmaCrawlLoader.switchboard.noticeURL.remove(urlhash);
// retry crawling with new url
this.url = redirectionUrl;
diff --git a/source/de/anomic/plasma/dbImport/AbstractImporter.java b/source/de/anomic/plasma/dbImport/AbstractImporter.java
index 823f6642a..4dcdd8798 100644
--- a/source/de/anomic/plasma/dbImport/AbstractImporter.java
+++ b/source/de/anomic/plasma/dbImport/AbstractImporter.java
@@ -2,7 +2,7 @@ package de.anomic.plasma.dbImport;
import java.io.File;
-import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.logging.serverLog;
public abstract class AbstractImporter extends Thread implements dbImporter{
@@ -13,8 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
protected boolean stopped = false;
protected boolean paused = false;
- protected plasmaSwitchboard sb;
- protected File importPath, indexPath;
+ protected File importPath;
protected int cacheSize;
protected long preloadTime;
@@ -23,27 +22,27 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
protected long globalPauseLast;
protected long globalPauseDuration;
protected String error;
+ protected plasmaWordIndex wi;
- public AbstractImporter(plasmaSwitchboard theSb) {
- super(theSb.dbImportManager.runningJobs,"");
- this.sb = theSb;
+ public AbstractImporter(plasmaWordIndex wi) {
+ //super(theSb.dbImportManager.runningJobs,"");
+ this.wi = wi;
}
public String getError() {
return this.error;
}
- public void init(File theImportPath, File theIndexPath) {
+ public void init(File theImportPath) {
if (theImportPath == null) throw new NullPointerException("The Import path must not be null.");
this.importPath = theImportPath;
- this.indexPath = theIndexPath;
// getting a job id from the import manager
- this.jobID = this.sb.dbImportManager.getJobID();
+ //this.jobID = this.sb.dbImportManager.getJobID();
// initializing the logger and setting a more verbose thread name
this.log = new serverLog("IMPORT_" + this.jobType + "_" + this.jobID);
- this.setName("IMPORT_" + this.jobType + "_" + this.sb.dbImportManager.getJobID());
+ this.setName("IMPORT_" + this.jobType /*+ "_" + this.sb.dbImportManager.getJobID()*/);
}
public void startIt() {
diff --git a/source/de/anomic/plasma/dbImport/AssortmentImporter.java b/source/de/anomic/plasma/dbImport/AssortmentImporter.java
index 20a5640eb..2e70f25ff 100644
--- a/source/de/anomic/plasma/dbImport/AssortmentImporter.java
+++ b/source/de/anomic/plasma/dbImport/AssortmentImporter.java
@@ -5,8 +5,7 @@ import java.io.IOException;
import java.util.Iterator;
import de.anomic.index.indexContainer;
-import de.anomic.index.indexRWIEntryOld;
-import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexAssortment;
public class AssortmentImporter extends AbstractImporter implements dbImporter{
@@ -15,31 +14,29 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
private int wordEntityCount = 0;
private int wordEntryCount = 0;
- private File importAssortmentFile;
private plasmaWordIndexAssortment assortmentFile;
- public AssortmentImporter(plasmaSwitchboard sb) {
- super(sb);
+ public AssortmentImporter(plasmaWordIndex wi) {
+ super(wi);
this.jobType = "ASSORTMENT";
}
- public void init(File theImportAssortmentFile, File theIndexFile, int theCacheSize, long preloadTime) {
- super.init(theImportAssortmentFile, theIndexFile);
- this.importAssortmentFile = theImportAssortmentFile;
+ public void init(File theImportAssortmentFile, int theCacheSize, long preloadTime) {
+ super.init(theImportAssortmentFile);
this.cacheSize = theCacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 2*1024*1024;
String errorMsg = null;
- if (!this.importAssortmentFile.getName().matches("indexAssortment0[0-6][0-9]\\.db"))
- errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' has an invalid name.";
- if (!this.importAssortmentFile.exists())
- errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' does not exist.";
- else if (this.importAssortmentFile.isDirectory())
- errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is a directory.";
- else if (!this.importAssortmentFile.canRead())
- errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is not readable.";
- else if (!this.importAssortmentFile.canWrite())
- errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is not writeable.";
+ if (!this.importPath.getName().matches("indexAssortment0[0-6][0-9]\\.db"))
+ errorMsg = "AssortmentFile '" + this.importPath + "' has an invalid name.";
+ if (!this.importPath.exists())
+ errorMsg = "AssortmentFile '" + this.importPath + "' does not exist.";
+ else if (this.importPath.isDirectory())
+ errorMsg = "AssortmentFile '" + this.importPath + "' is a directory.";
+ else if (!this.importPath.canRead())
+ errorMsg = "AssortmentFile '" + this.importPath + "' is not readable.";
+ else if (!this.importPath.canWrite())
+ errorMsg = "AssortmentFile '" + this.importPath + "' is not writeable.";
if (errorMsg != null) {
this.log.logSevere(errorMsg);
throw new IllegalStateException(errorMsg);
@@ -49,10 +46,10 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
File importAssortmentPath = null;
int assortmentNr = -1;
try {
- importAssortmentPath = new File(this.importAssortmentFile.getParent());
- assortmentNr = Integer.valueOf(this.importAssortmentFile.getName().substring("indexAssortment".length(),"indexAssortment".length()+3)).intValue();
+ importAssortmentPath = new File(this.importPath.getParent());
+ assortmentNr = Integer.valueOf(this.importPath.getName().substring("indexAssortment".length(),"indexAssortment".length()+3)).intValue();
if (assortmentNr <1 || assortmentNr > 64) {
- errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' has an invalid name.";
+ errorMsg = "AssortmentFile '" + this.importPath + "' has an invalid name.";
}
} catch (NumberFormatException e) {
errorMsg = "Unable to parse the assortment file number.";
@@ -61,9 +58,9 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
}
// initializing the import assortment db
- this.log.logInfo("Initializing source assortment file");
+ this.log.logInfo("Initializing source assortment file " + theImportAssortmentFile);
try {
- this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexRWIEntryOld.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log);
+ this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, assortmentNr, this.cacheSize/1024, preloadTime, this.log);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@@ -95,7 +92,7 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
public void run() {
try {
// getting a content interator
- Iterator contentIterator = this.assortmentFile.containers(null, true, false);
+ Iterator contentIterator = this.assortmentFile.wordContainers(null, true, false);
while (contentIterator.hasNext()) {
this.wordEntityCount++;
@@ -105,14 +102,11 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
this.wordEntryCount += container.size();
// importing entity container to home db
- this.sb.wordIndex.addEntries(container, System.currentTimeMillis(), false);
+ wi.addEntries(container, System.currentTimeMillis(), false);
- if (this.wordEntityCount % 500 == 0) {
+ if (this.wordEntityCount % 1000 == 0) {
this.log.logFine(this.wordEntityCount + " word entities processed so far.");
}
- if (this.wordEntryCount % 2000 == 0) {
- this.log.logFine(this.wordEntryCount + " word entries processed so far.");
- }
if (isAborted()) break;
}
} catch (Exception e) {
@@ -121,8 +115,12 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
} finally {
this.log.logInfo("Import process finished.");
this.globalEnd = System.currentTimeMillis();
- this.sb.dbImportManager.finishedJobs.add(this);
+ //this.sb.dbImportManager.finishedJobs.add(this);
this.assortmentFile.close();
+ File bkpPath = new File(importPath.getParentFile(), "imported");
+ bkpPath.mkdirs();
+ File bkpFile = new File(bkpPath, importPath.getName());
+ importPath.renameTo(bkpFile);
}
}
diff --git a/source/de/anomic/plasma/dbImport/dbImportManager.java b/source/de/anomic/plasma/dbImport/dbImportManager.java
index fbca00a2a..0c8c253e2 100644
--- a/source/de/anomic/plasma/dbImport/dbImportManager.java
+++ b/source/de/anomic/plasma/dbImport/dbImportManager.java
@@ -58,10 +58,8 @@ public class dbImportManager {
if (type.length() == 0) return null;
dbImporter newImporter = null;
- if (type.equals("plasmaDB")) {
- newImporter = new plasmaDbImporter(this.sb);
- } else if (type.equalsIgnoreCase("ASSORTMENT")) {
- newImporter = new AssortmentImporter(this.sb);
+ if (type.equalsIgnoreCase("ASSORTMENT")) {
+ newImporter = new AssortmentImporter(this.sb.wordIndex);
} else if (type.equalsIgnoreCase("NURL")) {
newImporter = new plasmaCrawlNURLImporter(this.sb);
}
diff --git a/source/de/anomic/plasma/dbImport/dbImporter.java b/source/de/anomic/plasma/dbImport/dbImporter.java
index 81fe9de94..c141f68fc 100644
--- a/source/de/anomic/plasma/dbImport/dbImporter.java
+++ b/source/de/anomic/plasma/dbImport/dbImporter.java
@@ -24,6 +24,6 @@ public interface dbImporter {
public String getError();
public String getStatus();
- public void init(File importPath, File indexPath, int cacheSize, long preloadTime);
+ public void init(File indexPath, int cacheSize, long preloadTime);
public void startIt();
}
diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
index 258956b47..2385b5345 100644
--- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
@@ -18,9 +18,10 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
private int importStartSize;
private int urlCount = 0;
private int profileCount = 0;
+ private plasmaSwitchboard sb;
public plasmaCrawlNURLImporter(plasmaSwitchboard theSb) {
- super(theSb);
+ super(theSb.wordIndex);
this.jobType="NURL";
}
@@ -45,8 +46,8 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
return theStatus.toString();
}
- public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) {
- super.init(theImportPath, theIndexPath);
+ public void init(File theImportPath, int theCacheSize, long preloadTime) {
+ super.init(theImportPath);
this.cacheSize = theCacheSize;
this.preloadTime = preloadTime;
@@ -174,10 +175,10 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
}
// if the url does not alredy exists in the destination stack we insert it now
- if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) {
- plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(nextEntry);
+ if (!this.sb.noticeURL.existsInStack(nextHash)) {
+ plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(nextEntry);
ne.store();
- this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
+ this.sb.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
}
// removing hash from the import db
diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
index 77ec24cfa..8a3b08d58 100644
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@@ -9,17 +9,11 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
-import de.anomic.plasma.plasmaCrawlLURL;
-import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate;
public class plasmaDbImporter extends AbstractImporter implements dbImporter {
- private plasmaCrawlLURL homeUrlDB;
- private plasmaWordIndex homeWordIndex;
-
- private plasmaCrawlLURL importUrlDB;
private plasmaWordIndex importWordIndex;
private int importStartSize;
@@ -30,8 +24,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;
- public plasmaDbImporter(plasmaSwitchboard theSb) {
- super(theSb);
+ public plasmaDbImporter(plasmaWordIndex homeWI, plasmaWordIndex importWI) {
+ super(homeWI);
+ this.importWordIndex = importWI;
this.jobType = "PLASMADB";
}
@@ -51,18 +46,12 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return theStatus.toString();
}
- public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) {
- super.init(theImportPath, theIndexPath);
+ public void init(File theImportPath, int theCacheSize, long preloadTime) {
+ super.init(theImportPath);
- this.homeWordIndex = this.sb.wordIndex;
- this.homeUrlDB = this.sb.urlPool.loadedURL;
this.cacheSize = theCacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
- if (this.homeWordIndex.getRoot().equals(this.importPath)) {
- throw new IllegalArgumentException("Import and home DB directory must not be equal");
- }
-
// configure import DB
String errorMsg = null;
if (!this.importPath.exists()) errorMsg = "Import directory does not exist.";
@@ -75,10 +64,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
this.log.logFine("Initializing source word index db.");
- this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, true, (this.cacheSize/2)/1024, preloadTime / 2, this.log);
+ this.importWordIndex = new plasmaWordIndex(this.importPath, this.cacheSize/2, this.cacheSize/2, preloadTime / 2, this.log);
- this.log.logFine("Initializing import URL db.");
- this.importUrlDB = new plasmaCrawlLURL(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2);
this.importStartSize = this.importWordIndex.size();
}
@@ -87,7 +74,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
importWordsDB();
} finally {
this.globalEnd = System.currentTimeMillis();
- this.sb.dbImportManager.finishedJobs.add(this);
+ //this.sb.dbImportManager.finishedJobs.add(this);
}
}
@@ -107,16 +94,16 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
this.log.logInfo("STARTING DB-IMPORT");
try {
- this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "' to '" + this.homeWordIndex.getRoot().getAbsolutePath() + "'.");
- this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
- this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs.");
+ this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'");
+ this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
+ this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");
HashSet unknownUrlBuffer = new HashSet();
HashSet importedUrlBuffer = new HashSet();
// iterate over all words from import db
//Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
- Iterator indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
+ Iterator indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator();
while (!isAborted() && indexContainerIterator.hasNext()) {
TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true));
@@ -157,11 +144,11 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url
// getting the url entry
- indexURLEntry urlEntry = this.importUrlDB.load(urlHash, null);
+ indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null);
if (urlEntry != null) {
/* write it into the home url db */
- this.homeUrlDB.store(urlEntry);
+ wi.loadedURL.store(urlEntry);
importedUrlBuffer.add(urlHash);
this.urlCounter++;
@@ -183,7 +170,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
if (isAborted()) break;
// importing entity container to home db
- if (newContainer.size() > 0) { this.homeWordIndex.addEntries(newContainer, System.currentTimeMillis(), false); }
+ if (newContainer.size() > 0) { wi.addEntries(newContainer, System.currentTimeMillis(), false); }
// delete complete index entity file
this.importWordIndex.deleteContainer(this.wordHash);
@@ -203,7 +190,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
"Speed: "+ 500*1000/duration + " word entities/s" +
" | Elapsed time: " + serverDate.intervalToString(getElapsedTime()) +
" | Estimated time: " + serverDate.intervalToString(getEstimatedTime()) + "\n" +
- "Home Words = " + this.homeWordIndex.size() +
+ "Home Words = " + wi.size() +
" | Import Words = " + this.importWordIndex.size());
this.wordChunkStart = this.wordChunkEnd;
this.wordChunkStartHash = this.wordChunkEndHash;
@@ -217,7 +204,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
if (!indexContainerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
- TreeSet containers = this.importWordIndex.indexContainerSet(this.wordHash, plasmaWordIndex.RL_WORDFILES, false, 100);
+ TreeSet containers = this.importWordIndex.indexContainerSet(this.wordHash, false, false, 100);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) {
@@ -226,16 +213,15 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
}
- this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
- this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs.");
+ this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
+ this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");
} catch (Exception e) {
this.log.logSevere("Database import failed.",e);
e.printStackTrace();
this.error = e.toString();
} finally {
this.log.logInfo("Import process finished.");
- if (this.importUrlDB != null) try { this.importUrlDB.close(); } catch (Exception e){}
- if (this.importWordIndex != null) try { this.importWordIndex.close(5000); } catch (Exception e){}
+ if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (Exception e){}
}
}
diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java
index e2ef4dd9b..b6c805358 100644
--- a/source/de/anomic/plasma/plasmaCrawlEURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlEURL.java
@@ -171,9 +171,12 @@ public class plasmaCrawlEURL {
}
}
- public void close() throws IOException {
+ public void close() {
if (urlIndexFile != null) {
- urlIndexFile.close();
+ try {
+ urlIndexFile.close();
+ } catch (IOException e) {
+ }
urlIndexFile = null;
}
}
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index 63496276f..75368f06d 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -95,11 +95,11 @@ public final class plasmaCrawlLURL {
// the class object
private kelondroIndex urlIndexFile = null;
- public plasmaCrawlLURL(File plasmaPath, File indexPath, int bufferkb, long preloadTime) {
+ public plasmaCrawlLURL(File indexPath, long buffer, long preloadTime) {
super();
try {
- urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder);
+ urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", buffer, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@@ -583,7 +583,7 @@ public final class plasmaCrawlLURL {
} catch (MalformedURLException e) {}
if (args[0].equals("-l")) try {
// arg 1 is path to URLCache
- final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0);
+ final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[2]), 1, 0);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
System.out.println(((indexURLEntry) enu.next()).toString());
diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java
index 888df3367..229a7100f 100644
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@@ -393,9 +393,9 @@ public final class plasmaCrawlStacker {
// check if the url is double registered
checkInterruption();
String nexturlhash = plasmaURL.urlHash(nexturl);
- String dbocc = this.sb.urlPool.exists(nexturlhash);
+ String dbocc = this.sb.urlExists(nexturlhash);
indexURLEntry oldEntry = null;
- oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
+ oldEntry = this.sb.wordIndex.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
@@ -437,7 +437,7 @@ public final class plasmaCrawlStacker {
// add the url into the crawling queue
checkInterruption();
- plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
+ plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */
loadDate, /* load date */
referrerHash, /* last url in crawling queue */
@@ -448,7 +448,7 @@ public final class plasmaCrawlStacker {
0 /*forkfactor, default value */
);
ne.store();
- this.sb.urlPool.noticeURL.push(
+ this.sb.noticeURL.push(
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/,
nexturl.getHost(),
@@ -1053,7 +1053,7 @@ public final class plasmaCrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
- plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
+ plasmaCrawlEURL.Entry ee = sb.errorURL.newEntry(
new URL(this.theMsg.url()),
this.theMsg.referrerHash(),
this.theMsg.initiatorHash(),
@@ -1063,7 +1063,7 @@ public final class plasmaCrawlStacker {
new kelondroBitfield()
);
ee.store();
- sb.urlPool.errorURL.stackPushEntry(ee);
+ sb.errorURL.stackPushEntry(ee);
}
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java
index f4d93825b..7fac09f00 100644
--- a/source/de/anomic/plasma/plasmaDHTChunk.java
+++ b/source/de/anomic/plasma/plasmaDHTChunk.java
@@ -180,12 +180,12 @@ public class plasmaDHTChunk {
private void selectTransferContainers(String hash, int mincount, int maxcount, int maxtime) throws InterruptedException {
try {
this.selectionStartTime = System.currentTimeMillis();
- int refcountRAM = selectTransferContainersResource(hash, plasmaWordIndex.RL_RAMCACHE, maxcount, maxtime);
+ int refcountRAM = selectTransferContainersResource(hash, true, maxcount, maxtime);
if (refcountRAM >= mincount) {
log.logFine("DHT selection from RAM: " + refcountRAM + " entries");
return;
}
- int refcountFile = selectTransferContainersResource(hash, plasmaWordIndex.RL_WORDFILES, maxcount, maxtime);
+ int refcountFile = selectTransferContainersResource(hash, false, maxcount, maxtime);
log.logFine("DHT selection from FILE: " + refcountFile + " entries, RAM provided only " + refcountRAM + " entries");
return;
} finally {
@@ -193,11 +193,11 @@ public class plasmaDHTChunk {
}
}
- private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount, int maxtime) throws InterruptedException {
+ private int selectTransferContainersResource(String hash, boolean ram, int maxcount, int maxtime) throws InterruptedException {
// the hash is a start hash from where the indexes are picked
ArrayList tmpContainers = new ArrayList(maxcount);
try {
- Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, resourceLevel, true, maxcount).iterator();
+ Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, ram, true, maxcount).iterator();
indexContainer container;
Iterator urlIter;
indexRWIEntry iEntry;
diff --git a/source/de/anomic/plasma/plasmaDHTFlush.java b/source/de/anomic/plasma/plasmaDHTFlush.java
index 10d95f563..f8ad29a23 100644
--- a/source/de/anomic/plasma/plasmaDHTFlush.java
+++ b/source/de/anomic/plasma/plasmaDHTFlush.java
@@ -169,7 +169,7 @@ public class plasmaDHTFlush extends Thread {
// selecting 500 words to transfer
this.status = "Running: Selecting chunk " + iteration;
- newDHTChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.sb.urlPool.loadedURL, this.chunkSize/3*2, this.chunkSize, -1, this.startPointHash);
+ newDHTChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.sb.wordIndex.loadedURL, this.chunkSize/3*2, this.chunkSize, -1, this.startPointHash);
/* If we havn't selected a word chunk this could be because of
* a) no words are left in the index
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index a5d103ef8..9de1c4440 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -744,7 +744,7 @@ public final class plasmaHTCache {
URL url = null;
// try the urlPool
try {
- url = plasmaSwitchboard.getSwitchboard().urlPool.getURL(urlHash);
+ url = plasmaSwitchboard.getSwitchboard().getURL(urlHash);
} catch (Exception e) {
log.logWarning("getURL(" + urlHash + "): " /*+ e.getMessage()*/, e);
url = null;
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 5c8174b92..6964213fc 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -134,6 +134,7 @@ import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
+import de.anomic.index.indexRWIEntryNew;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
@@ -206,7 +207,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public File rankingPath;
public File workPath;
public HashMap rankingPermissions;
- public plasmaURLPool urlPool;
+ public plasmaCrawlNURL noticeURL;
+ public plasmaCrawlEURL errorURL;
public plasmaWordIndex wordIndex;
public plasmaHTCache cacheManager;
public plasmaSnippetCache snippetCache;
@@ -366,10 +368,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// read memory amount
- int ramLURL = (int) getConfigLong("ramCacheLURL", 1024) / 1024;
+ int ramLURL = (int) getConfigLong("ramCacheLURL", 1024);
long ramLURL_time = getConfigLong("ramCacheLURL_time", 1000);
- ramLURL = Math.max((int) (serverMemory.available() / 2 / 1024), ramLURL);
- setConfig("ramCacheLURL", ramLURL * 1024);
+ ramLURL = Math.max((int) (serverMemory.available() / 2), ramLURL);
+ setConfig("ramCacheLURL", ramLURL);
int ramNURL = (int) getConfigLong("ramCacheNURL", 1024) / 1024;
long ramNURL_time = getConfigLong("ramCacheNURL_time", 1000);
ramNURL = Math.max((int) (serverMemory.available() / 10 / 1024), ramNURL);
@@ -378,10 +380,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
long ramEURL_time = getConfigLong("ramCacheEURL_time", 1000);
ramEURL = Math.max((int) (serverMemory.available() / 20 / 1024), ramEURL);
setConfig("ramCacheEURL", ramEURL * 1024);
- int ramRWI = (int) getConfigLong("ramCacheRWI", 1024) / 1024;
+ int ramRWI = (int) getConfigLong("ramCacheRWI", 1024);
long ramRWI_time = getConfigLong("ramCacheRWI_time", 1000);
- ramRWI = Math.max((int) (serverMemory.available() / 4 / 1024), ramRWI);
- setConfig("ramCacheRWI", ramRWI * 1024);
+ ramRWI = Math.max((int) (serverMemory.available() / 4), ramRWI);
+ setConfig("ramCacheRWI", ramRWI);
int ramHTTP = (int) getConfigLong("ramCacheHTTP", 1024) / 1024;
long ramHTTP_time = getConfigLong("ramCacheHTTP_time", 1000);
int ramMessage = (int) getConfigLong("ramCacheMessage", 1024) / 1024;
@@ -429,12 +431,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start indexing management
log.logConfig("Starting Indexing Management");
- urlPool = new plasmaURLPool(plasmaPath, indexPath,
- ramLURL,
- ramNURL,
- ramEURL,
- ramLURL_time);
- wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log);
+ wordIndex = new plasmaWordIndex(indexPath, ramRWI, ramLURL, ramRWI_time, log);
+ noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1);
+ errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1);
// set a high maximum cache size to current size; this is adopted later automatically
int wordCacheMaxCount = Math.max((int) getConfigLong("wordCacheInitCount", 30000),
@@ -471,7 +470,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* initialize switchboard queue
* ====================================================================== */
// create queue
- this.sbQueue = new plasmaSwitchboardQueue(this.cacheManager, this.urlPool.loadedURL, new File(this.plasmaPath, "switchboardQueue1.stack"), this.profiles);
+ this.sbQueue = new plasmaSwitchboardQueue(this.cacheManager, this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue1.stack"), this.profiles);
// setting the indexing queue slots
indexingSlots = (int) getConfigLong("indexer.slots", 100);
@@ -727,6 +726,29 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public boolean isRobinsonMode() {
return (yacyCore.seedDB.sizeConnected() == 0) && (yacyCore.seedDB.mySeed.isVirgin());
}
+
+ public String urlExists(String hash) {
+ // tests if hash occurrs in any database
+ // if it exists, the name of the database is returned,
+ // if it not exists, null is returned
+ if (wordIndex.loadedURL.exists(hash)) return "loaded";
+ if (noticeURL.existsInStack(hash)) return "crawler";
+ if (errorURL.exists(hash)) return "errors";
+ return null;
+ }
+
+ public URL getURL(String urlhash) throws IOException {
+ if (urlhash.equals(plasmaURL.dummyHash)) return null;
+ try {
+ plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
+ if (ne != null) return ne.url();
+ } catch (IOException e) {}
+ indexURLEntry le = wordIndex.loadedURL.load(urlhash, null);
+ if (le != null) return le.comp().url();
+ plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
+ if (ee != null) return ee.url();
+ return null;
+ }
/**
* This method changes the HTCache size.
@@ -796,7 +818,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public boolean cleanProfiles() throws InterruptedException {
- if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return false;
+ if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return false;
final Iterator iter = profiles.profiles(true);
plasmaCrawlProfile.entry entry;
boolean hasDoneSomething = false;
@@ -970,9 +992,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
sbQueue.close();
flushCitationReference(crg, "crg");
log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
- int waitingBoundSeconds = Integer.parseInt(getConfig("maxWaitingWordFlush", "120"));
- urlPool.close();
- wordIndex.close(waitingBoundSeconds);
+ noticeURL.close();
+ errorURL.close();
+ wordIndex.close();
log.logConfig("SWITCHBOARD SHUTDOWN TERMINATED");
}
@@ -1017,7 +1039,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// flush some entries from the RAM cache
// (new permanent cache flushing)
wordIndex.flushCacheSome(sbQueue.size() != 0);
- urlPool.loadedURL.flushCacheSome();
+ wordIndex.loadedURL.flushCacheSome();
boolean doneSomething = false;
@@ -1041,7 +1063,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
) {
// generate new chunk
int minChunkSize = (int) getConfigLong("indexDistribution.minChunkSize", 30);
- dhtTransferChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.urlPool.loadedURL, minChunkSize, dhtTransferIndexCount, 5000);
+ dhtTransferChunk = new plasmaDHTChunk(this.log, wordIndex, wordIndex.loadedURL, minChunkSize, dhtTransferIndexCount, 5000);
doneSomething = true;
}
@@ -1079,10 +1101,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do one processing step
log.logFine("DEQUEUE: sbQueueSize=" + sbQueue.size() +
- ", coreStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
- ", limitStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) +
- ", overhangStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) +
- ", remoteStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
+ ", coreStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
+ ", limitStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) +
+ ", overhangStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) +
+ ", remoteStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
try {
nextentry = sbQueue.pop();
if (nextentry == null) {
@@ -1112,9 +1134,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public int cleanupJobSize() {
int c = 0;
- if ((urlPool.errorURL.stackSize() > 1000)) c++;
+ if ((errorURL.stackSize() > 1000)) c++;
for (int i = 1; i <= 6; i++) {
- if (urlPool.loadedURL.getStackSize(i) > 1000) c++;
+ if (wordIndex.loadedURL.getStackSize(i) > 1000) c++;
}
return c;
}
@@ -1133,17 +1155,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// clean up error stack
checkInterruption();
- if ((urlPool.errorURL.stackSize() > 1000)) {
- log.logFine("Cleaning Error-URLs report stack, " + urlPool.errorURL.stackSize() + " entries on stack");
- urlPool.errorURL.clearStack();
+ if ((errorURL.stackSize() > 1000)) {
+ log.logFine("Cleaning Error-URLs report stack, " + errorURL.stackSize() + " entries on stack");
+ errorURL.clearStack();
hasDoneSomething = true;
}
// clean up loadedURL stack
for (int i = 1; i <= 6; i++) {
checkInterruption();
- if (urlPool.loadedURL.getStackSize(i) > 1000) {
- log.logFine("Cleaning Loaded-URLs report stack, " + urlPool.loadedURL.getStackSize(i) + " entries on stack " + i);
- urlPool.loadedURL.clearStack(i);
+ if (wordIndex.loadedURL.getStackSize(i) > 1000) {
+ log.logFine("Cleaning Loaded-URLs report stack, " + wordIndex.loadedURL.getStackSize(i) + " entries on stack " + i);
+ wordIndex.loadedURL.clearStack(i);
hasDoneSomething = true;
}
}
@@ -1209,11 +1231,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int coreCrawlJobSize() {
- return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
+ return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
}
public boolean coreCrawlJob() {
- if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
+ if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
}
@@ -1247,10 +1269,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do a local crawl
plasmaCrawlNURL.Entry urlEntry = null;
- while (urlEntry == null && urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
- String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
+ while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
+ String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
- urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
+ urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@@ -1276,11 +1298,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int limitCrawlTriggerJobSize() {
- return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+ return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
}
public boolean limitCrawlTriggerJob() {
- if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) {
+ if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) {
//log.logDebug("LimitCrawl: queue is empty");
return false;
}
@@ -1292,7 +1314,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (toshift > 1000) toshift = 1000;
if (toshift > limitCrawlTriggerJobSize()) toshift = limitCrawlTriggerJobSize();
for (int i = 0; i < toshift; i++) {
- urlPool.noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
+ noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
}
log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl");
}
@@ -1312,10 +1334,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// start a global crawl, if possible
- String stats = "REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
- + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
+ String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
- plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+ plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@@ -1327,7 +1349,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logFine("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter="
+ profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
- boolean tryRemote = ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) &&
+ boolean tryRemote = ((noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) &&
(profile.remoteIndexing()) &&
(urlEntry.initiator() != null) &&
// (!(urlEntry.initiator().equals(indexURL.dummyHash))) &&
@@ -1359,7 +1381,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int remoteTriggeredCrawlJobSize() {
- return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
+ return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
}
public boolean remoteTriggeredCrawlJob() {
@@ -1367,7 +1389,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do nothing if either there are private processes to be done
// or there is no global crawl on the stack
- if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
+ if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
//log.logDebug("GlobalCrawl: queue is empty");
return false;
}
@@ -1398,10 +1420,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
- String stats = "REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
- + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
+ String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
- plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
+ plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
@@ -1531,7 +1553,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} catch (MalformedURLException e1) {}
}
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.normalizedURLString() +
- ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
+ ", NEW CRAWL STACK SIZE IS " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
}
stackEndTime = System.currentTimeMillis();
@@ -1568,7 +1590,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
// create a new loaded URL db entry
- indexURLEntry newEntry = urlPool.loadedURL.newEntry(
+ indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
"", // author
@@ -1594,8 +1616,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* ========================================================================
* STORE URL TO LOADED-URL-DB
* ======================================================================== */
- urlPool.loadedURL.store(newEntry);
- urlPool.loadedURL.stack(
+ wordIndex.loadedURL.store(newEntry);
+ wordIndex.loadedURL.stack(
newEntry, // loaded url db entry
initiatorPeerHash, // initiator peer hash
yacyCore.seedDB.mySeed.hash, // executor peer hash
@@ -1672,7 +1694,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaCondenser.word2hash(word);
- indexRWIEntry wordIdxEntry = wordIndex.newRWIEntry(
+ indexRWIEntry wordIdxEntry = new indexRWIEntryNew(
urlHash,
urlLength, urlComps,
wordStat.count,
@@ -1807,7 +1829,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// removing current entry from notice URL queue
- boolean removed = urlPool.noticeURL.remove(entry.urlHash()); // worked-off
+ boolean removed = noticeURL.remove(entry.urlHash()); // worked-off
if (!removed) {
log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
}
@@ -1911,7 +1933,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL refererURL = null;
String refererHash = urlEntry.referrerHash();
if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try {
- refererURL = this.urlPool.getURL(refererHash);
+ refererURL = this.getURL(refererHash);
} catch (IOException e) {
refererURL = null;
}
@@ -1924,7 +1946,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// return true iff another peer has/will index(ed) the url
if (urlEntry == null) {
- log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
+ log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return true; // superfluous request; true correct in this context
}
@@ -1952,7 +1974,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do the request
try {
- HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()), 6000);
+ HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerHash()), 6000);
// check success
/*
@@ -1990,10 +2012,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
- indexURLEntry entry = urlPool.loadedURL.newEntry(propStr);
- urlPool.loadedURL.store(entry);
- urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
- urlPool.noticeURL.remove(entry.hash());
+ indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr);
+ wordIndex.loadedURL.store(entry);
+ wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
+ noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
@@ -2051,7 +2073,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//}
// create a new search event
- plasmaSearchEvent theSearch = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, postsort, log, wordIndex, urlPool.loadedURL, snippetCache);
+ plasmaSearchEvent theSearch = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, postsort, log, wordIndex, wordIndex.loadedURL, snippetCache);
plasmaSearchResult acc = theSearch.search();
// fetch snippets
@@ -2094,7 +2116,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8"));
- urlPool.loadedURL.remove(urlentry.hash()); // clean up
+ wordIndex.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
urlname = "http://share." + seed.getName() + ".yacy" + filename;
@@ -2217,7 +2239,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
- indexURLEntry entry = urlPool.loadedURL.load(urlhash, null);
+ indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null);
if (entry == null) return 0;
indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) return 0;
@@ -2245,7 +2267,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (witer != null) count = removeReferences(urlhash, witer);
// finally delete the url entry itself
- urlPool.loadedURL.remove(urlhash);
+ wordIndex.loadedURL.remove(urlhash);
return count;
} catch (ParserException e) {
return 0;
@@ -2373,15 +2395,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (getConfig("allowDistributeIndex","false").equalsIgnoreCase("false")) {
return "no DHT distribution: not enabled";
}
- if (urlPool.loadedURL.size() < 10) {
- return "no DHT distribution: loadedURL.size() = " + urlPool.loadedURL.size();
+ if (wordIndex.loadedURL.size() < 10) {
+ return "no DHT distribution: loadedURL.size() = " + wordIndex.loadedURL.size();
}
if (wordIndex.size() < 100) {
return "no DHT distribution: not enough words - wordIndex.size() = " + wordIndex.size();
}
if ((getConfig("allowDistributeIndexWhileCrawling","false").equalsIgnoreCase("false")) &&
- ((urlPool.noticeURL.stackSize() > 0) || (sbQueue.size() > 3))) {
- return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + urlPool.noticeURL.stackSize() + ", sbQueue.size() = " + sbQueue.size();
+ ((noticeURL.stackSize() > 0) || (sbQueue.size() > 3))) {
+ return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + noticeURL.stackSize() + ", sbQueue.size() = " + sbQueue.size();
}
return null;
}
@@ -2522,7 +2544,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
kelondroBitfield flags
) {
// create a new errorURL DB entry
- plasmaCrawlEURL.Entry ee = this.urlPool.errorURL.newEntry(
+ plasmaCrawlEURL.Entry ee = this.errorURL.newEntry(
url,
referrerHash,
initiator,
@@ -2534,7 +2556,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// store the entry
ee.store();
// push it onto the stack
- this.urlPool.errorURL.stackPushEntry(ee);
+ this.errorURL.stackPushEntry(ee);
}
public void checkInterruption() throws InterruptedException {
diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java
deleted file mode 100644
index 59fec6dee..000000000
--- a/source/de/anomic/plasma/plasmaURLPool.java
+++ /dev/null
@@ -1,99 +0,0 @@
-// plasmaURLPool.java
-// -----------------------
-// part of YaCy
-// (C) by Michael Peter Christen; mc@anomic.de
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2005
-// last major change: 16.06.2005
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this softare or this documentation. The usage of this software
-// is on your own risk. The installation and usage (starting/running) of this
-// software may allow other people or application to access your computer and
-// any attached devices and is highly dependent on the configuration of the
-// software which must be done by the user of the software; the author(s) is
-// (are) also not responsible for proper configuration and usage of the
-// software, even if provoked by documentation provided together with
-// the software.
-//
-// Any changes to this file according to the GPL as documented in the file
-// gpl.txt aside this file in the shipment you received can be done to the
-// lines that follows this copyright notice here, but changes must not be
-// done inside the copyright notive above. A re-distribution must contain
-// the intact and unchanged copyright notice.
-// Contributions and changes to the program code must be marked as such.
-
-// this class combines all url storage methods into one. It is the host for all url storage
-
-
-package de.anomic.plasma;
-
-import java.io.File;
-import java.io.IOException;
-
-import de.anomic.plasma.plasmaURL;
-import de.anomic.index.indexURLEntry;
-import de.anomic.net.URL;
-
-public class plasmaURLPool {
-
-
- public final plasmaCrawlLURL loadedURL;
- public final plasmaCrawlNURL noticeURL;
- public final plasmaCrawlEURL errorURL;
-
- public plasmaURLPool(File plasmaPath, File indexPath,
- int ramLURL,
- int ramNURL,
- int ramEURL,
- long preloadTime) {
- loadedURL = new plasmaCrawlLURL(plasmaPath, indexPath, ramLURL, preloadTime);
- noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1);
- errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1);
- }
-
- public String exists(String hash) {
- // tests if hash occurrs in any database
- // if it exists, the name of the database is returned,
- // if it not exists, null is returned
- if (loadedURL.exists(hash)) return "loaded";
- if (noticeURL.existsInStack(hash)) return "crawler";
- if (errorURL.exists(hash)) return "errors";
- return null;
- }
-
- public URL getURL(String urlhash) throws IOException {
- if (urlhash.equals(plasmaURL.dummyHash)) return null;
- try {
- plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
- if (ne != null) return ne.url();
- } catch (IOException e) {}
- indexURLEntry le = loadedURL.load(urlhash, null);
- if (le != null) return le.comp().url();
- plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
- if (ee != null) return ee.url();
- return null;
- }
-
- public void close() {
- try {loadedURL.close();} catch (IOException e) {}
- noticeURL.close();
- try {errorURL.close();} catch (IOException e) {}
- }
-}
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index d1a78b912..08df1d71d 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -27,7 +27,6 @@
package de.anomic.plasma;
import java.io.File;
-import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@@ -47,11 +46,8 @@ import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
-import de.anomic.kelondro.kelondroBitfield;
-import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroOrder;
-import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.logging.serverLog;
@@ -59,67 +55,33 @@ import de.anomic.yacy.yacyDHTAction;
public final class plasmaWordIndex implements indexRI {
- private static final kelondroRow payloadrowold = indexRWIEntryOld.urlEntryRow;
- private static final kelondroRow payloadrownew = indexRWIEntryNew.urlEntryRow;
+ private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;
+ private final indexRAMRI dhtOutCache, dhtInCache;
+ private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
+ public boolean busyCacheFlush; // shows if a cache flush is currently performed
+ private int idleDivisor, busyDivisor;
+ public final plasmaCrawlLURL loadedURL;
- private final File oldDatabaseRoot;
- private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;
- private final indexRAMRI dhtOutCache, dhtInCache;
- private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
- public boolean busyCacheFlush; // shows if a cache flush is currently performed
- private int idleDivisor, busyDivisor;
-
- public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log) {
- this.oldDatabaseRoot = oldDatabaseRoot;
- File textindexcache = new File(newIndexRoot, "PUBLIC/TEXT/RICACHE");
+ public plasmaWordIndex(File indexRoot, long rwibuffer, long lurlbuffer, long preloadTime, serverLog log) {
+ File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
- this.dhtOutCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump1.array", log, true);
- this.dhtInCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump2.array", log, true);
+ this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntryNew.urlEntryRow, 2040, "dump1.array", log);
+ this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntryNew.urlEntryRow, 2040, "dump2.array", log);
// create collections storage path
- File textindexcollections = new File(newIndexRoot, "PUBLIC/TEXT/RICOLLECTION");
+ File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
- this.collections = new indexCollectionRI(textindexcollections, "collection", bufferkb * 1024, preloadTime, payloadrownew);
+ this.collections = new indexCollectionRI(textindexcollections, "collection", rwibuffer, preloadTime, indexRWIEntryNew.urlEntryRow);
+
+ // create LURL-db
+ loadedURL = new plasmaCrawlLURL(indexRoot, lurlbuffer, preloadTime);
+ // performance settings
busyCacheFlush = false;
this.busyDivisor = 5000;
this.idleDivisor = 420;
}
- public kelondroRow payloadrow() {
- return payloadrownew;
- }
-
- public indexRWIEntry newRWIEntry(
- String urlHash,
- int urlLength,
- int urlComps,
- int titleLength,
- int hitcount,
- int wordcount,
- int phrasecount,
- int posintext,
- int posinphrase,
- int posofphrase,
- int worddistance,
- int sizeOfPage,
- long lastmodified,
- long updatetime,
- int quality,
- String language,
- char doctype,
- int outlinksSame,
- int outlinksOther,
- kelondroBitfield flags ) {
- return new indexRWIEntryNew(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
- posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype,
- outlinksSame, outlinksOther, flags);
- }
-
- public File getRoot() {
- return oldDatabaseRoot;
- }
-
public int maxURLinDHTOutCache() {
return dhtOutCache.maxURLinCache();
}
@@ -184,12 +146,12 @@ public final class plasmaWordIndex implements indexRI {
}
public indexContainer emptyContainer(String wordHash) {
- return new indexContainer(wordHash, payloadrow(), true);
+ return new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
}
- public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
+ public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
if (entry instanceof indexRWIEntryOld) {
- if (entry.urlHash() == null) return null;
+ if (entry.urlHash() == null) return;
entry = new indexRWIEntryNew((indexRWIEntryOld) entry);
}
@@ -203,12 +165,12 @@ public final class plasmaWordIndex implements indexRI {
dhtOutCache.addEntry(wordHash, entry, updateTime, false);
flushControl();
}
- return null;
}
+ /*
private indexContainer convertOld2New(indexContainer entries) {
// convert old entries to new entries
- indexContainer newentries = new indexContainer(entries.getWordHash(), payloadrownew, true);
+ indexContainer newentries = new indexContainer(entries.getWordHash(), indexRWIEntryNew.urlEntryRow);
Iterator i = entries.entries();
indexRWIEntryOld old;
while (i.hasNext()) {
@@ -219,9 +181,9 @@ public final class plasmaWordIndex implements indexRI {
}
return newentries;
}
-
- public indexContainer addEntries(indexContainer entries, long updateTime, boolean dhtInCase) {
- if (entries.row().objectsize() == payloadrowold.objectsize()) entries = convertOld2New(entries);
+ */
+ public void addEntries(indexContainer entries, long updateTime, boolean dhtInCase) {
+ assert (entries.row().objectsize() == indexRWIEntryNew.urlEntryRow.objectsize());
// set dhtInCase depending on wordHash
if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(entries.getWordHash()))) dhtInCase = true;
@@ -233,7 +195,6 @@ public final class plasmaWordIndex implements indexRI {
dhtOutCache.addEntries(entries, updateTime, false);
flushControl();
}
- return null;
}
public void flushCacheSome(boolean busy) {
@@ -263,12 +224,7 @@ public final class plasmaWordIndex implements indexRI {
// flush the wordHash
indexContainer c = ram.deleteContainer(wordHash);
- if (c != null) {
- indexContainer feedback = collections.addEntries(c, c.updated(), false);
- if (feedback != null) {
- throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString());
- }
- }
+ if (c != null) collections.addEntries(c, c.updated(), false);
// pause to next loop to give other processes a chance to use IO
//try {this.wait(8);} catch (InterruptedException e) {}
@@ -330,7 +286,7 @@ public final class plasmaWordIndex implements indexRI {
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaCondenser.word2hash(word);
- ientry = newRWIEntry(urlHash,
+ ientry = new indexRWIEntryNew(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count,
condenser.RESULT_SIMI_WORDS,
@@ -415,16 +371,16 @@ public final class plasmaWordIndex implements indexRI {
return size;
}
- public void close(int waitingBoundSeconds) {
+ public void close() {
synchronized (this) {
- dhtInCache.close(waitingBoundSeconds);
- dhtOutCache.close(waitingBoundSeconds);
- collections.close(-1);
+ dhtInCache.close();
+ dhtOutCache.close();
+ collections.close();
}
}
public indexContainer deleteContainer(String wordHash) {
- indexContainer c = new indexContainer(wordHash, payloadrow(), true);
+ indexContainer c = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
c.add(dhtInCache.deleteContainer(wordHash), -1);
c.add(dhtOutCache.deleteContainer(wordHash), -1);
c.add(collections.deleteContainer(wordHash), -1);
@@ -456,9 +412,7 @@ public final class plasmaWordIndex implements indexRI {
}
public static final int RL_RAMCACHE = 0;
- public static final int RL_COLLECTIONS = 1; // the new index structure
- public static final int RL_ASSORTMENTS = 2; // (to be) outdated structure
- public static final int RL_WORDFILES = 3; // (to be) outdated structure
+ public static final int RL_COLLECTIONS = 1;
public int tryRemoveURLs(String urlHash) {
// this tries to delete an index from the cache that has this
@@ -468,14 +422,14 @@ public final class plasmaWordIndex implements indexRI {
return dhtInCache.tryRemoveURLs(urlHash) | dhtOutCache.tryRemoveURLs(urlHash);
}
- public TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) {
+ public TreeSet indexContainerSet(String startHash, boolean ram, boolean rot, int count) {
// creates a set of indexContainers
// this does not use the dhtInCache
kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone());
containerOrder.rotate(startHash.getBytes());
TreeSet containers = new TreeSet(containerOrder);
- Iterator i = wordContainers(startHash, resourceLevel, rot);
- if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) count = Math.min(dhtOutCache.size(), count);
+ Iterator i = wordContainers(startHash, ram, rot);
+ if (ram) count = Math.min(dhtOutCache.size(), count);
indexContainer container;
while ((count > 0) && (i.hasNext())) {
container = (indexContainer) i.next();
@@ -486,38 +440,35 @@ public final class plasmaWordIndex implements indexRI {
}
return containers;
}
+
- public Iterator wordContainers(String startHash, boolean rot) {
- // returns an iteration of indexContainers
- return wordContainers(startHash, RL_WORDFILES, rot);
- }
-
- public Iterator wordContainers(String startHash, int resourceLevel, boolean rot) {
- if (rot) return new rotatingContainerIterator(startHash, resourceLevel);
- else return wordContainers(startHash, resourceLevel);
+ public Iterator wordContainers(String startHash, boolean ram, boolean rot) {
+ if (rot) return new rotatingContainerIterator(startHash, ram);
+ else return wordContainers(startHash, ram);
}
- private Iterator wordContainers(String startWordHash, int resourceLevel) {
+ public Iterator wordContainers(String startWordHash, boolean ram) {
kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone());
containerOrder.rotate(startWordHash.getBytes());
- if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) {
+ if (ram) {
return dhtOutCache.wordContainers(startWordHash, false);
- }
- return new kelondroMergeIterator(
+ } else {
+ return new kelondroMergeIterator(
dhtOutCache.wordContainers(startWordHash, false),
collections.wordContainers(startWordHash, false),
containerOrder,
indexContainer.containerMergeMethod,
true);
+ }
}
public class rotatingContainerIterator implements Iterator {
Iterator i;
- int resourceLevel;
+ boolean ram;
- public rotatingContainerIterator(String startWordHash, int resourceLevel) {
- this.resourceLevel = resourceLevel;
- i = wordContainers(startWordHash, resourceLevel);
+ public rotatingContainerIterator(String startWordHash, boolean ram) {
+ this.ram = ram;
+ i = wordContainers(startWordHash, ram);
}
public void finalize() {
@@ -527,7 +478,7 @@ public final class plasmaWordIndex implements indexRI {
public boolean hasNext() {
if (i.hasNext()) return true;
else {
- i = wordContainers("------------", resourceLevel);
+ i = wordContainers("------------", ram);
return i.hasNext();
}
}
@@ -541,44 +492,6 @@ public final class plasmaWordIndex implements indexRI {
}
} // class rotatingContainerIterator
- public Object migrateWords2index(String wordhash) throws IOException {
- // returns the number of entries that had been added to the assortments
- // can be negative if some assortments have been moved to the backend
- File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash);
- if (!(db.exists())) return "not available";
- plasmaWordIndexFile entity = null;
- try {
- entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
- int size = entity.size();
- indexContainer container = new indexContainer(wordhash, payloadrow(), true);
-
- try {
- Iterator entries = entity.elements(true);
- indexRWIEntry entry;
- while (entries.hasNext()) {
- entry = (indexRWIEntry) entries.next();
- // System.out.println("ENTRY = " + entry.getUrlHash());
- container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis());
- }
- // we have read all elements, now delete the entity
- entity.deleteComplete();
- entity.close();
- entity = null;
-
- indexContainer feedback = collections.addEntries(container, container.updated(), false);
- if (feedback != null) return feedback;
- return new Integer(size);
- } catch (kelondroException e) {
- // database corrupted, we simply give up the database and delete it
- try { entity.close(); } catch (Exception ee) { }
- entity = null;
- try { db.delete(); } catch (Exception ee) { }
- return "database corrupted; deleted";
- }
- } finally {
- if (entity != null) try {entity.close();}catch(Exception e){}
- }
- }
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
// see http://www.yacy-forum.de/viewtopic.php?p=18093#18093
@@ -609,7 +522,7 @@ public final class plasmaWordIndex implements indexRI {
indexRWIEntry entry = null;
URL url = null;
HashSet urlHashs = new HashSet();
- Iterator indexContainerIterator = indexContainerSet(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
+ Iterator indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator();
while (indexContainerIterator.hasNext() && run) {
waiter();
container = (indexContainer) indexContainerIterator.next();
@@ -639,7 +552,7 @@ public final class plasmaWordIndex implements indexRI {
}
if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
- TreeSet containers = indexContainerSet(container.getWordHash(), plasmaWordIndex.RL_WORDFILES, false, 100);
+ TreeSet containers = indexContainerSet(container.getWordHash(), false, false, 100);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) {
@@ -693,13 +606,14 @@ public final class plasmaWordIndex implements indexRI {
public static void main(String[] args) {
// System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
// System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));
- File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB");
+ /*
File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX");
- plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, true, 555, 1000, new serverLog("TESTAPP"));
+ plasmaWordIndex index = new plasmaWordIndex(indexdb, true, 555, 1000, new serverLog("TESTAPP"));
Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true);
while (containerIter.hasNext()) {
System.out.println("File: " + (indexContainer) containerIter.next());
}
+ */
}
}
diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java
index 5542e4366..7a04961a6 100644
--- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java
+++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java
@@ -58,6 +58,7 @@ import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
+import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroColumn;
@@ -74,12 +75,9 @@ public final class plasmaWordIndexAssortment {
// class variables
private File assortmentFile;
- private int assortmentLength;
private serverLog log;
private kelondroCache assortments;
private long bufferSize;
- private long preloadTime;
- private kelondroRow payloadrow;
private static String intx(int x) {
String s = Integer.toString(x);
@@ -92,23 +90,20 @@ public final class plasmaWordIndexAssortment {
structure[0] = new kelondroColumn("byte[] wordhash-" + yacySeedDB.commonHashLength);
structure[1] = new kelondroColumn("Cardinal occ-4 {b256}");
structure[2] = new kelondroColumn("Cardinal time-8 {b256}");
- kelondroColumn p = new kelondroColumn("byte[] urlprops-" + payloadrow.objectsize());
+ kelondroColumn p = new kelondroColumn("byte[] urlprops-" + indexRWIEntryOld.urlEntryRow.objectsize());
for (int i = 0; i < assortmentCapacity; i++) structure[3 + i] = p;
return new kelondroRow(structure);
}
private int assortmentCapacity(int rowsize) {
- return (rowsize - yacySeedDB.commonHashLength - 12) / payloadrow.objectsize();
+ return (rowsize - yacySeedDB.commonHashLength - 12) / indexRWIEntryOld.urlEntryRow.objectsize();
}
- public plasmaWordIndexAssortment(File storagePath, kelondroRow payloadrow, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException {
+ public plasmaWordIndexAssortment(File storagePath, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException {
if (!(storagePath.exists())) storagePath.mkdirs();
- this.payloadrow = payloadrow;
this.assortmentFile = new File(storagePath, assortmentFileName + intx(assortmentLength) + ".db");
- this.assortmentLength = assortmentLength;
//this.bufferStructureLength = 3 + 2 * assortmentLength;
this.bufferSize = bufferkb * 1024;
- this.preloadTime = preloadTime;
this.log = log;
// open assortment tree file
long start = System.currentTimeMillis();
@@ -122,138 +117,26 @@ public final class plasmaWordIndexAssortment {
assortments.cacheNodeStatus()[1] + " preloaded");
}
-
- public void store(indexContainer newContainer) throws IOException {
- // stores a word index to assortment database
- // this throws an exception if the word hash already existed
- //log.logDebug("storeAssortment: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime);
- if (newContainer.size() != assortmentLength) throw new RuntimeException("plasmaWordIndexAssortment.store: wrong container size");
- kelondroRow.Entry row = assortments.row().newEntry();
- row.setCol(0, newContainer.getWordHash().getBytes());
- row.setCol(1, 1);
- row.setCol(2, newContainer.updated());
- Iterator entries = newContainer.entries();
- indexRWIEntry entry;
- for (int i = 0; i < assortmentLength; i++) {
- entry = (indexRWIEntry) entries.next();
- row.setCol(3 + i, entry.toKelondroEntry().bytes());
- }
- kelondroRow.Entry oldrow = null;
- try {
- oldrow = assortments.put(row);
- } catch (IOException e) {
- e.printStackTrace();
- log.logSevere("storeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
- } catch (IndexOutOfBoundsException e) {
- e.printStackTrace();
- log.logSevere("storeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
- } catch (kelondroException e) {
- e.printStackTrace();
- log.logSevere("storeAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
- }
- if (oldrow != null) throw new RuntimeException("Store to assortment ambiguous");
- }
-
- public indexContainer remove(String wordHash) {
- // deletes a word index from assortment database
- // and returns the content record
- kelondroRow.Entry row = null;
- try {
- row = assortments.remove(wordHash.getBytes());
- } catch (IOException e) {
- log.logSevere("removeAssortment/IO-error: " + e.getMessage()
- + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
- return null;
- } catch (kelondroException e) {
- log.logSevere("removeAssortment/kelondro-error: " + e.getMessage()
- + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
- return null;
- }
- return row2container(row);
- }
-
- public boolean contains(String wordHash) {
- // gets a word index from assortment database
- // and returns the content record
- kelondroRow.Entry row = null;
- try {
- row = assortments.get(wordHash.getBytes());
- return (row != null);
- } catch (IOException e) {
- return false;
- } catch (kelondroException e) {
- log.logSevere("removeAssortment/kelondro-error: " + e.getMessage()
- + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
- return false;
- }
- }
-
- public indexContainer get(String wordHash) {
- // gets a word index from assortment database
- // and returns the content record
- kelondroRow.Entry row = null;
- try {
- row = assortments.get(wordHash.getBytes());
- } catch (IOException e) {
- log.logSevere("removeAssortment/IO-error: " + e.getMessage()
- + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
- return null;
- } catch (kelondroException e) {
- log.logSevere("removeAssortment/kelondro-error: " + e.getMessage()
- + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
- return null;
- }
- return row2container(row);
- }
public final indexContainer row2container(kelondroRow.Entry row) {
if (row == null) return null;
String wordHash = row.getColString(0, null);
final long updateTime = row.getColLong(2);
- indexContainer container = new indexContainer(wordHash, payloadrow, false);
+ indexContainer container = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
int al = assortmentCapacity(row.objectsize());
for (int i = 0; i < al; i++) {
- container.add(new indexRWIEntry[] { new indexRWIEntryOld(row.getColBytes(3 + i)) }, updateTime);
+ // fill AND convert old entries to new entries
+ container.add(new indexRWIEntry[] { new indexRWIEntryNew(new indexRWIEntryOld(row.getColBytes(3 + i))) }, updateTime);
}
return container;
}
- private void resetDatabase() {
- // deletes the assortment database and creates a new one
- if (assortments != null) try {
- assortments.close();
- } catch (IOException e) {}
-
- try {
- // make a back-up
- File backupPath = new File(assortmentFile.getParentFile(), "ABKP");
- if (!(backupPath.exists())) backupPath.mkdirs();
- File backupFile = new File(backupPath, assortmentFile.getName() + System.currentTimeMillis());
- assortmentFile.renameTo(backupFile);
- log.logInfo("a back-up of the deleted assortment file is in " + backupFile.toString());
- if (assortmentFile.exists()) assortmentFile.delete();
- assortments = new kelondroCache(kelondroTree.open(assortmentFile, bufferSize / 2, preloadTime, bufferStructure(assortmentLength)), bufferSize / 2, true, false);
- } catch (Exception e) {
- // if this fails, delete the file
- if (!(assortmentFile.delete())) throw new RuntimeException("cannot delete assortment database");
- }
- }
-
- public Iterator containers(String startWordHash, boolean up, boolean rot) throws IOException {
+ public Iterator wordContainers(String startWordHash, boolean up, boolean rot) throws IOException {
// returns an iteration of indexContainer elements
try {
return new containerIterator(startWordHash, up, rot);
} catch (kelondroException e) {
log.logSevere("iterateAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e);
- resetDatabase();
return null;
}
}
@@ -288,22 +171,6 @@ public final class plasmaWordIndexAssortment {
return 0;
}
}
-
- public int cacheNodeChunkSize() {
- return assortments.cacheNodeChunkSize();
- }
-
- public int cacheObjectChunkSize() {
- return assortments.cacheObjectChunkSize();
- }
-
- public int[] cacheNodeStatus() {
- return assortments.cacheNodeStatus();
- }
-
- public long[] cacheObjectStatus() {
- return assortments.cacheObjectStatus();
- }
public void close() {
try {
diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
deleted file mode 100644
index 377cc8f09..000000000
--- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
+++ /dev/null
@@ -1,408 +0,0 @@
-// plasmaWordIndexAssortmentCluster.java
-// -------------------------------------
-// part of YACY
-// (C) by Michael Peter Christen; mc@anomic.de
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2005
-// last major change: 20.5.2005
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this softare or this documentation. The usage of this software
-// is on your own risk. The installation and usage (starting/running) of this
-// software may allow other people or application to access your computer and
-// any attached devices and is highly dependent on the configuration of the
-// software which must be done by the user of the software; the author(s) is
-// (are) also not responsible for proper configuration and usage of the
-// software, even if provoked by documentation provided together with
-// the software.
-//
-// Any changes to this file according to the GPL as documented in the file
-// gpl.txt aside this file in the shipment you received can be done to the
-// lines that follows this copyright notice here, but changes must not be
-// done inside the copyright notive above. A re-distribution must contain
-// the intact and unchanged copyright notice.
-// Contributions and changes to the program code must be marked as such.
-
-/*
- An assortment-cluster is a set of assortments.
- Each one carries a different number of URL's
- */
-
-package de.anomic.plasma;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-import de.anomic.index.indexContainer;
-import de.anomic.index.indexContainerOrder;
-import de.anomic.index.indexRWIEntry;
-import de.anomic.index.indexRI;
-import de.anomic.kelondro.kelondroCache;
-import de.anomic.kelondro.kelondroMergeIterator;
-import de.anomic.kelondro.kelondroNaturalOrder;
-import de.anomic.kelondro.kelondroRecords;
-import de.anomic.kelondro.kelondroRow;
-import de.anomic.server.logging.serverLog;
-
-public final class plasmaWordIndexAssortmentCluster implements indexRI {
-
- // class variables
- private int clusterCount; // number of cluster files
- public int clusterCapacity; // number of all url referrences that can be stored to a single word in the cluster
-
- //private serverLog log;
- private plasmaWordIndexAssortment[] assortments;
- private long completeBufferKB;
- private kelondroRow payloadrow;
-
- public plasmaWordIndexAssortmentCluster(File assortmentsPath, int clusterCount, kelondroRow payloadrow, int bufferkb, long preloadTime, serverLog log) throws IOException {
- // set class variables
- if (!(assortmentsPath.exists())) assortmentsPath.mkdirs();
- this.payloadrow = payloadrow;
- this.clusterCount = clusterCount;
- this.clusterCapacity = clusterCount * (clusterCount + 1) / 2;
- this.completeBufferKB = bufferkb;
- // this.log = log;
- this.assortments = new plasmaWordIndexAssortment[clusterCount];
-
- // open cluster and close it directly again to detect the element sizes
- int[] sizes = new int[clusterCount];
- int sumSizes = 1;
- plasmaWordIndexAssortment testAssortment;
- for (int i = 0; i < clusterCount; i++) {
- testAssortment = new plasmaWordIndexAssortment(assortmentsPath, payloadrow, i + 1, 0, 0, null);
- sizes[i] = testAssortment.size() + clusterCount - i;
- sumSizes += sizes[i];
- testAssortment.close();
- testAssortment = null;
- }
-
- // initialize cluster using the cluster elements size for optimal buffer
- // size
- long nextTime;
- long startTime;
- long sS = (long) sumSizes;
- for (int i = 0; i < clusterCount; i++) {
- nextTime = Math.max(0, preloadTime * ((long) sizes[i]) / sS);
- startTime = System.currentTimeMillis();
- assortments[i] = new plasmaWordIndexAssortment(
- assortmentsPath,
- payloadrow,
- i + 1,
- (int) (completeBufferKB * (long) sizes[i] / (long) sumSizes),
- nextTime,
- log);
- preloadTime -= System.currentTimeMillis() - startTime;
- sS -= sizes[i];
- }
- }
-
- private indexContainer storeSingular(indexContainer newContainer) throws IOException {
- // this tries to store the record. If the record does not fit, or a same hash already
- // exists and would not fit together with the new record, then the record is deleted from
- // the assortmen(s) and returned together with the newRecord.
- // if storage was successful, NULL is returned.
- if (newContainer.size() > clusterCount) return newContainer; // it will not fit
- indexContainer buffer;
- while ((buffer = assortments[newContainer.size() - 1].remove(newContainer.getWordHash())) != null) {
- if (newContainer.add(buffer, -1) == 0) return newContainer; // security check; othervise this loop does not terminate
- if (newContainer.size() > clusterCount) return newContainer; // it will not fit
- }
- // the assortment (newContainer.size() - 1) should now be empty. put it in there
- assortments[newContainer.size() - 1].store(newContainer);
- // return null to show that we have stored the new Record successfully
- return null;
- }
-
- private void storeForced(indexContainer newContainer) throws IOException {
- // this stores the record and overwrites an existing record.
- // this is safe if we can be shure that the record does not exist before.
- if ((newContainer == null) || (newContainer.size() == 0) || (newContainer.size() > clusterCount)) return; // it will not fit
- assortments[newContainer.size() - 1].store(newContainer);
- }
-
- private void storeStretched(indexContainer newContainer) throws IOException {
- // this stores the record and stretches the storage over
- // all the assortments that are necessary to fit in the record
- // IMPORTANT: it must be ensured that the wordHash does not exist in the cluster before
- // i.e. by calling removeFromAll
- if (newContainer.size() <= clusterCount) {
- storeForced(newContainer);
- return;
- }
-
- // calculate minimum cluster insert point
- int clusterMinStart = clusterCount;
- int cap = clusterCapacity - newContainer.size() - 2 * clusterCount;
- while (cap > 0) {
- cap -= clusterMinStart;
- clusterMinStart--;
- }
-
- // point the real cluster insert point somewhere between the minimum and the maximum
- int clusterStart = clusterCount - (int) (Math.random() * (clusterCount - clusterMinStart));
-
- // do the insert
- indexContainer c;
- Iterator i = newContainer.entries();
- for (int j = clusterStart; j >= 1; j--) {
- c = new indexContainer(newContainer.getWordHash(), payloadrow, false);
- for (int k = 0; k < j; k++) {
- if (i.hasNext()) {
- c.add((indexRWIEntry) i.next(), newContainer.updated());
- } else {
- storeForced(c);
- return;
- }
- }
- storeForced(c);
- }
- }
-
- public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
- indexContainer container = new indexContainer(wordHash, payloadrow, false);
- container.add(newEntry);
- return addEntries(container, updateTime, dhtCase);
- }
-
- public long getUpdateTime(String wordHash) {
- indexContainer entries = getContainer(wordHash, null, false, -1);
- if (entries == null) return 0;
- return entries.updated();
- }
-
- public indexContainer addEntries(indexContainer newContainer, long creationTime, boolean dhtCase) {
- // this is called by the index ram cache flush process
- // it returnes NULL if the storage was successful
- // it returnes a new container if the given container cannot be stored
- // containers that are returned will be stored in a WORDS file
- if (newContainer == null) return null;
- if (newContainer.size() > clusterCapacity) return newContainer; // it will not fit
-
- // split the container into several smaller containers that will take the whole thing
- // first find out how the container can be splitted
- int testsize = Math.min(clusterCount, newContainer.size());
- int [] spaces = new int[testsize];
- for (int i = testsize - 1; i >= 0; i--) spaces[i] = 0;
- int need = newContainer.size();
- int selectedAssortment = testsize - 1;
- while (selectedAssortment >= 0) {
- if (selectedAssortment + 1 <= need) {
- spaces[selectedAssortment] = (assortments[selectedAssortment].get(newContainer.getWordHash()) == null) ? (selectedAssortment + 1) : 0;
- need -= spaces[selectedAssortment];
- assert (need >= 0);
- if (need == 0) break;
- }
- selectedAssortment--;
- }
- if (need == 0) {
- // we found spaces so that we can put in the newContainer into these spaces
- indexContainer c;
- Iterator i = newContainer.entries();
- for (int j = testsize - 1; j >= 0; j--) {
- if (spaces[j] == 0) continue;
- c = new indexContainer(newContainer.getWordHash(), payloadrow, false);
- for (int k = 0; k <= j; k++) {
- assert (i.hasNext());
- c.add((indexRWIEntry) i.next(), newContainer.updated());
- }
- try {
- storeForced(c);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return null;
- }
-
- if (newContainer.size() <= clusterCount) try {
- newContainer = storeSingular(newContainer);
- } catch (IOException e) {
- e.printStackTrace();
- }
- if (newContainer == null) return null;
-
- // clean up the whole thing and try to insert the container then
- newContainer.add(deleteContainer(newContainer.getWordHash(), -1), -1);
- if (newContainer.size() > clusterCapacity) return newContainer;
- try {
- storeStretched(newContainer);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return null;
- }
-
- public indexContainer deleteContainer(String wordHash) {
- return deleteContainer(wordHash, -1);
- }
-
- public indexContainer deleteContainer(String wordHash, long maxTime) {
- // removes all records from all the assortments and return them
- indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
- long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
- long remainingTime;
- for (int i = 0; i < clusterCount; i++) {
- buffer = assortments[i].remove(wordHash);
- remainingTime = limitTime - System.currentTimeMillis();
- if (0 > remainingTime) break;
- if (buffer != null) record.add(buffer, remainingTime);
- }
- return record;
- }
-
- /*
- public int removeEntries(String wordHash, String[] referenceHashes, boolean deleteComplete) {
- indexContainer c = deleteContainer(wordHash, -1);
- int b = c.size();
- c.removeEntries(wordHash, referenceHashes, false);
- if (c.size() != 0) {
- addEntries(c, c.updated(), false);
- }
- return b - c.size();
- }
- */
-
- public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
- indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
- boolean found = false;
- for (int i = 0; i < clusterCount; i++) {
- buffer = assortments[i].remove(wordHash);
- if ((buffer != null) && (buffer.remove(urlHash) != null)) found = true;
- record.add(buffer, -1);
- if (found) break;
- }
- // put back remaining
- if (record.size() != 0) {
- addEntries(record, record.updated(), false);
- }
- return found;
- }
-
- public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) {
- indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
- int initialSize = urlHashes.size();
- for (int i = 0; i < clusterCount; i++) {
- buffer = assortments[i].remove(wordHash);
- if (buffer != null) {
- // sort out url hashes that shall be deleted
- Iterator bi = buffer.entries();
- indexRWIEntry entry;
- while (bi.hasNext()) {
- entry = (indexRWIEntry) bi.next();
- if (urlHashes.remove(entry.urlHash())) bi.remove();
- }
- record.add(buffer, -1);
- }
- if (urlHashes.size() == 0) break;
- }
- // put back remaining
- if (record.size() != 0) {
- addEntries(record, record.updated(), false);
- }
- return initialSize - urlHashes.size();
- }
-
- public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
- // collect all records from all the assortments and return them
- indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
- long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
- for (int i = 0; i < clusterCount; i++) {
- buffer = assortments[i].get(wordHash);
- if (buffer != null) {
- buffer.select(urlselection);
- record.add(buffer, -1);
- }
- if (System.currentTimeMillis() > timeout) break;
- }
- return record;
- }
-
- public int indexSize(String wordHash) {
- int size = 0;
- for (int i = 0; i < clusterCount; i++) {
- if (assortments[i].contains(wordHash)) size += i + 1;
- }
- return size;
- }
-
- public Iterator wordContainers(String startWordHash, boolean rot) {
- try {
- return wordContainers(startWordHash, true, rot);
- } catch (IOException e) {
- return new HashSet().iterator();
- }
- }
-
- public Iterator wordContainers(String startWordHash, boolean up, boolean rot) throws IOException {
- // iterates indexContainer - Objects
- HashSet containerIterators = new HashSet();
- for (int i = 0; i < clusterCount; i++) containerIterators.add(assortments[i].containers(startWordHash, up, rot));
- return kelondroMergeIterator.cascade(containerIterators, new indexContainerOrder(kelondroNaturalOrder.naturalOrder), indexContainer.containerMergeMethod, up);
- }
-
- public int size() {
- int total = 0;
- for (int i = 0; i < clusterCount; i++) total += assortments[i].size();
- return total;
- }
-
- public int[] sizes() {
- int[] sizes = new int[clusterCount];
- for (int i = 0; i < clusterCount; i++) sizes[i] = assortments[i].size();
- return sizes;
- }
-
- public int cacheChunkSizeAvg() {
- int i = 0;
- for (int j = 0; j < clusterCount; j++) {
- i += assortments[j].cacheNodeChunkSize();
- }
- return i / clusterCount;
- }
-
- public int cacheObjectSizeAvg() {
- long c = 0, k = 0;
- for (int j = 0; j < clusterCount; j++) {
- c += assortments[j].size() * assortments[j].cacheObjectChunkSize();
- k += assortments[j].size();
- }
- return (k > 0) ? (int) (c / k) : 0;
- }
-
- public int[] cacheNodeStatus() {
- int[][] a = new int[assortments.length][];
- for (int i = assortments.length - 1; i >= 0; i--) a[i] = assortments[i].cacheNodeStatus();
- return kelondroRecords.cacheCombinedStatus(a, assortments.length);
- }
-
- public long[] cacheObjectStatus() {
- long[][] a = new long[assortments.length][];
- for (int i = assortments.length - 1; i >= 0; i--) a[i] = assortments[i].cacheObjectStatus();
- return kelondroCache.combinedStatus(a, a.length);
- }
-
- public void close(int waitingSeconds) {
- for (int i = 0; i < clusterCount; i++) assortments[i].close();
- }
-
-}
diff --git a/source/de/anomic/plasma/plasmaWordIndexFile.java b/source/de/anomic/plasma/plasmaWordIndexFile.java
index 68ed2691f..d1bd492cd 100644
--- a/source/de/anomic/plasma/plasmaWordIndexFile.java
+++ b/source/de/anomic/plasma/plasmaWordIndexFile.java
@@ -50,6 +50,7 @@ import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
+import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
@@ -131,7 +132,7 @@ public final class plasmaWordIndexFile {
public indexRWIEntry getEntry(String urlhash) throws IOException {
kelondroRow.Entry n = theIndex.get(urlhash.getBytes());
if (n == null) return null;
- return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null));
+ return new indexRWIEntryNew(new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null)));
}
public boolean contains(String urlhash) throws IOException {
@@ -142,33 +143,12 @@ public final class plasmaWordIndexFile {
return (theIndex.get(entry.urlHash().getBytes()) != null);
}
- public boolean addEntry(indexRWIEntry entry) throws IOException {
- if (entry == null) return false;
- indexRWIEntry oldEntry = getEntry(entry.urlHash());
- if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
- return false;
- }
- return (theIndex.put(entry.toKelondroEntry()) == null);
+ public void addEntry(indexRWIEntry entry) {
+ throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
- public int addEntries(indexContainer container) throws IOException {
- //System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
- // fetch the index cache
- if ((container == null) || (container.size() == 0)) return 0;
-
- // open file
- int count = 0;
-
- // write from vector
- if (container != null) {
- Iterator i = container.entries();
- while (i.hasNext()) {
- if (addEntry((indexRWIEntry) i.next())) count++;
- }
- }
-
- // close and return
- return count;
+ public void addEntries(indexContainer container) {
+ throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
public boolean deleteComplete() {
@@ -228,7 +208,7 @@ public final class plasmaWordIndexFile {
public Object next() {
if (i == null) return null;
kelondroRow.Entry n = (kelondroRow.Entry) i.next();
- return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null));
+ return new indexRWIEntryNew(new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null)));
}
public void remove() {
throw new UnsupportedOperationException();
@@ -239,8 +219,7 @@ public final class plasmaWordIndexFile {
return "DB:" + theIndex.toString();
}
-
- public void merge(plasmaWordIndexFile otherEntity, long time) throws IOException {
+ public void merge(plasmaWordIndexFile otherEntity, long time) {
// this is a merge of another entity to this entity
// the merge is interrupted when the given time is over
// a time=-1 means: no timeout
@@ -255,174 +234,4 @@ public final class plasmaWordIndexFile {
}
}
- /*
- // join methods
- private static int log2(int x) {
- int l = 0;
- while (x > 0) {x = x >> 1; l++;}
- return l;
- }
-
- public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
-
- // big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big.
- // this will result in a OR behavior of the search instead of an AND behavior
-
- long stamp = System.currentTimeMillis();
-
- // order entities by their size
- TreeMap map = new TreeMap();
- plasmaWordIndexEntity singleEntity;
- Iterator i = entities.iterator();
- int count = 0;
- while (i.hasNext()) {
- // get next entity:
- singleEntity = (plasmaWordIndexEntity) i.next();
-
- // check result
- if ((singleEntity == null) || (singleEntity.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a cunjunction of searches, we have no result if any word is not known
-
- // store result in order of result size
- map.put(new Long(singleEntity.size() * 1000 + count), singleEntity);
- count++;
- }
-
- // check if there is any result
- if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found
-
- // the map now holds the search results in order of number of hits per word
- // we now must pairwise build up a conjunction of these sets
- Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries
- plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k);
- while ((map.size() > 0) && (searchResult.size() > 0)) {
- // take the first element of map which is a result and combine it with result
- k = (Long) map.firstKey(); // the next smallest...
- time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
- searchA = searchResult;
- searchB = (plasmaWordIndexEntity) map.remove(k);
- searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1));
- // close the input files/structures
- if (searchA != searchResult) searchA.close();
- if (searchB != searchResult) searchB.close();
- }
- searchA = null; // free resources
- searchB = null; // free resources
-
- // in 'searchResult' is now the combined search result
- if (searchResult.size() == 0) return new plasmaWordIndexEntity(null);
- return searchResult;
- }
-
-
- public static plasmaWordIndexEntity joinConstructive(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException {
- if ((i1 == null) || (i2 == null)) return null;
- if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntity(null);
-
- // decide which method to use
- int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
- int low = ((i1.size() > i2.size()) ? i2.size() : i1.size());
- int stepsEnum = 10 * (high + low - 1);
- int stepsTest = 12 * log2(high) * low;
-
- // start most efficient method
- if (stepsEnum > stepsTest) {
- if (i1.size() < i2.size())
- return joinConstructiveByTest(i1, i2, time);
- else
- return joinConstructiveByTest(i2, i1, time);
- } else {
- return joinConstructiveByEnumeration(i1, i2, time);
- }
- }
-
- private static plasmaWordIndexEntity joinConstructiveByTest(plasmaWordIndexEntity small, plasmaWordIndexEntity large, long time) throws IOException {
- System.out.println("DEBUG: JOIN METHOD BY TEST");
- plasmaWordIndexEntity conj = new plasmaWordIndexEntity(null); // start with empty search result
- Iterator se = small.elements(true);
- plasmaWordIndexEntry ie0, ie1;
- long stamp = System.currentTimeMillis();
- try {
- while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
- ie0 = (plasmaWordIndexEntry) se.next();
- ie1 = large.getEntry(ie0.getUrlHash());
- if (ie1 != null) {
- // this is a hit. Calculate word distance:
- ie0.combineDistance(ie1);
- conj.addEntry(ie0);
- }
- }
- } catch (kelondroException e) {
- //serverLog.logSevere("PLASMA", "joinConstructiveByTest: Database corrupt (" + e.getMessage() + "), deleting index");
- small.deleteComplete();
- return conj;
- }
- return conj;
- }
-
- private static plasmaWordIndexEntity joinConstructiveByEnumeration(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException {
- System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
- plasmaWordIndexEntity conj = new plasmaWordIndexEntity(null); // start with empty search result
- Iterator e1 = i1.elements(true);
- Iterator e2 = i2.elements(true);
- int c;
- if ((e1.hasNext()) && (e2.hasNext())) {
- plasmaWordIndexEntry ie1;
- plasmaWordIndexEntry ie2;
- try {
- ie1 = (plasmaWordIndexEntry) e1.next();
- } catch (kelondroException e) {
- //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database corrupt 1 (" + e.getMessage() + "), deleting index");
- i1.deleteComplete();
- return conj;
- }
- try {
- ie2 = (plasmaWordIndexEntry) e2.next();
- } catch (kelondroException e) {
- //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database corrupt 2 (" + e.getMessage() + "), deleting index");
- i2.deleteComplete();
- return conj;
- }
- long stamp = System.currentTimeMillis();
- while ((System.currentTimeMillis() - stamp) < time) {
- c = ie1.getUrlHash().compareTo(ie2.getUrlHash());
- if (c < 0) {
- try {
- if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
- } catch (kelondroException e) {
- //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 1 corrupt (" + e.getMessage() + "), deleting index");
- i1.deleteComplete();
- break;
- }
- } else if (c > 0) {
- try {
- if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
- } catch (kelondroException e) {
- //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 2 corrupt (" + e.getMessage() + "), deleting index");
- i2.deleteComplete();
- break;
- }
- } else {
- // we have found the same urls in different searches!
- ie1.combineDistance(ie2);
- conj.addEntry(ie1);
- try {
- if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
- } catch (kelondroException e) {
- //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 1 corrupt (" + e.getMessage() + "), deleting index");
- i1.deleteComplete();
- break;
- }
- try {
- if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
- } catch (kelondroException e) {
- //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 2 corrupt (" + e.getMessage() + "), deleting index");
- i2.deleteComplete();
- break;
- }
- }
- }
- }
- return conj;
- }
-*/
}
diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java
index 8cd76accd..b14146c2a 100644
--- a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java
+++ b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java
@@ -43,7 +43,6 @@
package de.anomic.plasma;
import java.io.File;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
@@ -53,23 +52,19 @@ import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRI;
+import de.anomic.index.indexRWIEntryNew;
+import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroNaturalOrder;
-import de.anomic.kelondro.kelondroRow;
-import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public class plasmaWordIndexFileCluster implements indexRI {
// class variables
- private final File databaseRoot;
- private final serverLog log;
- private int size;
- private kelondroRow payloadrow;
+ private final File databaseRoot;
+ private int size;
- public plasmaWordIndexFileCluster(File databaseRoot, kelondroRow payloadrow, serverLog log) {
+ public plasmaWordIndexFileCluster(File databaseRoot) {
this.databaseRoot = databaseRoot;
- this.payloadrow = payloadrow;
- this.log = log;
this.size = 0;
}
@@ -77,7 +72,6 @@ public class plasmaWordIndexFileCluster implements indexRI {
return size;
}
-
public Iterator wordContainers(String startHash, boolean rot) {
return new containerIterator(wordHashes(startHash, rot));
}
@@ -234,16 +228,16 @@ public class plasmaWordIndexFileCluster implements indexRI {
if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute
if (exists(wordHash)) {
plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
- indexContainer container = new indexContainer(wordHash, payloadrow, false);
- indexRWIEntry entry;
+ indexContainer container = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
+ indexRWIEntryNew entry;
Iterator i = entity.elements(true);
while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
- entry = (indexRWIEntry) i.next();
+ entry = new indexRWIEntryNew((indexRWIEntryOld) i.next());
if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry);
}
return container;
} else {
- return new indexContainer(wordHash, payloadrow, false);
+ return new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
}
}
@@ -258,80 +252,26 @@ public class plasmaWordIndexFileCluster implements indexRI {
public indexContainer deleteContainer(String wordHash) {
plasmaWordIndexFile.removePlasmaIndex(databaseRoot, wordHash);
- return new indexContainer(wordHash, payloadrow, false);
+ return null;
}
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
- // removes all given url hashes from a single word index. Returns number of deletions.
- plasmaWordIndexFile pi = null;
- boolean removed = false;
- if (exists(wordHash)) try {
- pi = getEntity(wordHash, true, -1);
- if (pi.removeEntry(urlHash, deleteComplete)) removed = true;
- int size = pi.size();
- pi.close(); pi = null;
- // check if we can remove the index completely
- if ((deleteComplete) && (size == 0)) deleteContainer(wordHash);
- return removed;
- } catch (IOException e) {
- log.logSevere("plasmaWordIndexClassic.removeEntries: " + e.getMessage());
- return false;
- } finally {
- if (pi != null) try{pi.close();}catch(Exception e){}
- } else return false;
+ throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) {
- // removes all given url hashes from a single word index. Returns number of deletions.
- plasmaWordIndexFile pi = null;
- int count = 0;
- if (exists(wordHash)) try {
- pi = getEntity(wordHash, true, -1);
- Iterator i = urlHashes.iterator();
- while (i.hasNext()) if (pi.removeEntry((String) i.next(), deleteComplete)) count++;
- int size = pi.size();
- pi.close(); pi = null;
- // check if we can remove the index completely
- if ((deleteComplete) && (size == 0)) deleteContainer(wordHash);
- return count;
- } catch (IOException e) {
- log.logSevere("plasmaWordIndexClassic.removeEntries: " + e.getMessage());
- return count;
- } finally {
- if (pi != null) try{pi.close();}catch(Exception e){}
- } else return 0;
+ throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
- public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
- indexContainer container = new indexContainer(wordHash, payloadrow, false);
- container.add(newEntry);
- return addEntries(container, updateTime, dhtCase);
+ public void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
+ throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
- public indexContainer addEntries(indexContainer container, long creationTime, boolean highPriority) {
- //System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
- // fetch the index cache
- if ((container == null) || (container.size() == 0)) return null;
-
- // open file
- plasmaWordIndexFile pi = null;
- try {
- pi = new plasmaWordIndexFile(databaseRoot, container.getWordHash(), false);
- pi.addEntries(container);
-
- // close and return
- pi.close(); pi = null;
- return null;
- } catch (IOException e) {
- log.logSevere("plasmaWordIndexClassic.addEntries: " + e.getMessage());
- return container;
- } finally {
- if (pi != null) try{pi.close();}catch (Exception e){}
- }
+ public void addEntries(indexContainer container, long creationTime, boolean highPriority) {
+ throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
- public void close(int waitingSeconds) {
-
+ public void close() {
}
public int indexSize(String wordHash) {
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index ec17d28fe..8f372a69d 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -190,9 +190,9 @@ public class urlRedirectord implements serverHandler {
) {
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(this.nextURL);
- switchboard.urlPool.loadedURL.remove(urlhash);
- switchboard.urlPool.noticeURL.remove(urlhash);
- switchboard.urlPool.errorURL.remove(urlhash);
+ switchboard.wordIndex.loadedURL.remove(urlhash);
+ switchboard.noticeURL.remove(urlhash);
+ switchboard.errorURL.remove(urlhash);
// enqueuing URL for crawling
reasonString = switchboard.sbStackCrawlThread.stackCrawl(
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 398f548d9..2134dcf90 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -52,7 +52,6 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
-import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
@@ -520,30 +519,12 @@ public final class yacyClient {
// save the url entry
indexRWIEntry entry;
if (urlEntry.word() == null) {
- // the old way to define words
- int urlLength = comp.url().toNormalform().length();
- int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
-
- entry = wordIndex.newRWIEntry(
- urlEntry.hash(),
- urlLength,
- urlComps,
- comp.descr().length(),
- urlEntry.wordCount(),
- 0, 0, 0, 0, 0, 0,
- urlEntry.size(),
- urlEntry.moddate().getTime(),
- System.currentTimeMillis(),
- 0,
- urlEntry.language(),
- urlEntry.doctype(),
- 0,0,
- new kelondroBitfield(4)
- );
- } else {
- // the new way: the search-result-url transports all the attributes of word indexes
- entry = urlEntry.word();
+ yacyCore.log.logWarning("DEBUG-SEARCH: no word attached from peer " + targetPeer.getName() + ", version " + targetPeer.getVersion());
+ continue; // no word attached
}
+ // the search-result-url transports all the attributes of word indexes
+ entry = urlEntry.word();
+
if (urlEntry.snippet() != null) {
// we don't store the snippets along the url entry, because they are search-specific.
// instead, they are placed in a snipped-search cache.
diff --git a/source/de/anomic/yacy/yacyDHTAction.java b/source/de/anomic/yacy/yacyDHTAction.java
index 6e3d5ad22..7a5482c06 100644
--- a/source/de/anomic/yacy/yacyDHTAction.java
+++ b/source/de/anomic/yacy/yacyDHTAction.java
@@ -234,6 +234,7 @@ public class yacyDHTAction implements yacyPeerAction {
}
public static boolean shallBeOwnWord(String wordhash) {
+ if (yacyCore.seedDB == null) return false;
if (yacyCore.seedDB.mySeed.isPotential()) return false;
final double distance = dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash);
final double max = 1.2 / yacyCore.seedDB.sizeConnected();
diff --git a/source/de/anomic/yacy/yacyPeerActions.java b/source/de/anomic/yacy/yacyPeerActions.java
index 219169f8e..7644ffba8 100644
--- a/source/de/anomic/yacy/yacyPeerActions.java
+++ b/source/de/anomic/yacy/yacyPeerActions.java
@@ -134,8 +134,8 @@ public class yacyPeerActions {
sb.setConfig("totalPPM", Long.toString(indexedc / 1)); //no division by zero
seedDB.mySeed.put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30)
- seedDB.mySeed.put(yacySeed.LCOUNT, Integer.toString(sb.urlPool.loadedURL.size())); // the number of links that the peer has stored (LURL's)
- seedDB.mySeed.put(yacySeed.NCOUNT, Integer.toString(sb.urlPool.noticeURL.stackSize())); // the number of links that the peer has noticed, but not loaded (NURL's)
+ seedDB.mySeed.put(yacySeed.LCOUNT, Integer.toString(sb.wordIndex.loadedURL.size())); // the number of links that the peer has stored (LURL's)
+ seedDB.mySeed.put(yacySeed.NCOUNT, Integer.toString(sb.noticeURL.stackSize())); // the number of links that the peer has noticed, but not loaded (NURL's)
seedDB.mySeed.put(yacySeed.ICOUNT, Integer.toString(sb.cacheSizeMin())); // the minimum number of words that the peer has indexed (as it says)
seedDB.mySeed.put(yacySeed.SCOUNT, Integer.toString(seedDB.sizeConnected())); // the number of seeds that the peer has stored
seedDB.mySeed.put(yacySeed.CCOUNT, Double.toString(((int) ((seedDB.sizeConnected() + seedDB.sizeDisconnected() + seedDB.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)
diff --git a/source/yacy.java b/source/yacy.java
index 78abbeb21..fa718369b 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -72,11 +72,12 @@ import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
-import de.anomic.index.indexRWIEntryOld;
+import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroDyn;
+import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroRow;
@@ -87,10 +88,11 @@ import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaURLPool;
import de.anomic.plasma.plasmaWordIndex;
-import de.anomic.plasma.plasmaWordIndexAssortmentCluster;
+import de.anomic.plasma.plasmaWordIndexAssortment;
import de.anomic.plasma.plasmaWordIndexFile;
+import de.anomic.plasma.plasmaWordIndexFileCluster;
+import de.anomic.plasma.dbImport.AssortmentImporter;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
@@ -655,34 +657,96 @@ public final class yacy {
File indexRoot = new File(new File(homePath), "DATA/INDEX");
serverLog log = new serverLog("WORDMIGRATION");
log.logInfo("STARTING MIGRATION");
- plasmaWordIndex wordIndexCache = null;
- wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, true, 20000, 10000, log);
+ plasmaWordIndex wordIndexCache = new plasmaWordIndex(indexRoot, 60000000, 60000000, 10000, log);
enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true);
String wordhash;
File wordfile;
- Object migrationStatus;
+ int migrationCount;
while (words.hasMoreElements())
try {
wordfile = (File) words.nextElement();
wordhash = wordfile.getName().substring(0, 12);
// System.out.println("NOW: " + wordhash);
- migrationStatus = wordIndexCache.migrateWords2index(wordhash);
- if (migrationStatus instanceof Integer) {
- int migrationCount = ((Integer) migrationStatus).intValue();
+ migrationCount = migrateWords2index(dbroot, wordhash, wordIndexCache);
+ if (migrationCount >= 0) {
if (migrationCount == 0)
log.logInfo("SKIPPED " + wordhash + ": empty");
else if (migrationCount > 0)
log.logInfo("MIGRATED " + wordhash + ": " + migrationCount + " entries");
else
log.logInfo("REVERSED " + wordhash + ": " + (-migrationCount) + " entries");
- } else if (migrationStatus instanceof String) {
- log.logInfo("SKIPPED " + wordhash + ": " + migrationStatus);
+ } else {
+ log.logInfo("SKIPPED " + wordhash);
}
} catch (Exception e) {
log.logSevere("Exception", e);
}
log.logInfo("FINISHED MIGRATION JOB, WAIT FOR DUMP");
- wordIndexCache.close(60);
+ wordIndexCache.close();
+ log.logInfo("TERMINATED MIGRATION");
+ }
+
+
+ public static int migrateWords2index(File oldDatabaseRoot, String wordhash, plasmaWordIndex wi) throws IOException {
+ // returns the number of entries that had been added to the assortments
+ // can be negative if some assortments have been moved to the backend
+ File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash);
+ if (!(db.exists())) {
+ serverLog.logSevere("migrateWordIndex", "word index file for hash " + wordhash + " not found");
+ return -1;
+ }
+ plasmaWordIndexFile entity = null;
+ try {
+ entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
+ int size = entity.size();
+ indexContainer container = new indexContainer(wordhash, indexRWIEntryNew.urlEntryRow);
+
+ try {
+ Iterator entries = entity.elements(true);
+ indexRWIEntry entry;
+ while (entries.hasNext()) {
+ entry = (indexRWIEntry) entries.next();
+ // System.out.println("ENTRY = " + entry.getUrlHash());
+ container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis());
+ }
+ // we have read all elements, now delete the entity
+ entity.deleteComplete();
+ entity.close();
+ entity = null;
+
+ wi.addEntries(container, container.updated(), false);
+ return size;
+ } catch (kelondroException e) {
+ // database corrupted, we simply give up the database and delete it
+ try { entity.close(); } catch (Exception ee) { }
+ entity = null;
+ try { db.delete(); } catch (Exception ee) { }
+ serverLog.logSevere("migrateWordIndex", "database for hash " + wordhash + " corrupted; deleted");
+ return -1;
+ }
+ } finally {
+ if (entity != null) try {entity.close();}catch(Exception e){}
+ }
+ }
+
+ public static void migrateAssortments(String homePath) {
+ // run with "java -classpath classes yacy -migrateassortments"
+ try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
+ serverLog log = new serverLog("ASSORTMENTMIGRATION");
+ File aclusterroot = new File(new File(homePath), "DATA/PLASMADB/ACLUSTER");
+ File indexRoot = new File(new File(homePath), "DATA/INDEX");
+ plasmaWordIndex wordIndexCache = new plasmaWordIndex(indexRoot, 60000000, 60000000, 10000, log);
+ log.logInfo("STARTING MIGRATION");
+ String[] a = aclusterroot.list();
+ AssortmentImporter importer = new AssortmentImporter(wordIndexCache);
+ for (int i = a.length - 1; i >= 0; i--) {
+ if (a[i].startsWith("indexAssortment")) {
+ importer.init(new File(aclusterroot, a[i]), 16000000, 2000);
+ importer.run();
+ }
+ }
+ log.logInfo("FINISHED MIGRATION JOB, WAIT FOR DUMP");
+ wordIndexCache.close();
log.logInfo("TERMINATED MIGRATION");
}
@@ -693,7 +757,6 @@ public final class yacy {
public static void minimizeUrlDB(String homePath, int dbcache) {
// run with "java -classpath classes yacy -minimizeUrlDB"
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
- File plasmaroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
serverLog log = new serverLog("URL-CLEANUP");
try {
@@ -702,17 +765,17 @@ public final class yacy {
// db containing all currently loades urls
int cache = dbcache * 1024; // in KB
log.logFine("URLDB-Caches: "+cache+" bytes");
- plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexRoot, cache, 10000);
+ plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, cache, 10000);
// db used to hold all neede urls
- plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(plasmaroot, "minimized"), indexRoot, cache, 10000);
+ plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot, cache, 10000);
Runtime rt = Runtime.getRuntime();
- int cacheMem = (int)((serverMemory.max-rt.totalMemory())/1024)-(2*cache + 8*1024);
- if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up.");
+ int cacheMem = (int)(serverMemory.max-rt.totalMemory());
+ if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
- plasmaWordIndex wordIndex = new plasmaWordIndex(plasmaroot, indexRoot, true, cacheMem, 10000, log);
- Iterator indexContainerIterator = wordIndex.wordContainers("------------", plasmaWordIndex.RL_WORDFILES, false);
+ plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, cacheMem, cacheMem, 10000, log);
+ Iterator indexContainerIterator = wordIndex.wordContainers("------------", false, false);
long urlCounter = 0, wordCounter = 0;
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
@@ -767,7 +830,7 @@ public final class yacy {
currentUrlDB.close();
minimizedUrlDB.close();
- wordIndex.close(600);
+ wordIndex.close();
// TODO: rename the mimimized UrlDB to the name of the previous UrlDB
@@ -941,16 +1004,16 @@ public final class yacy {
File root = new File(homePath);
try {
- plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
+ final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
HashMap doms = new HashMap();
- System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
+ System.out.println("Started domain list extraction from " + sb.wordIndex.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
System.out.println("java -Xmxm -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ ]");
int c = 0;
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
- Iterator eiter = pool.loadedURL.entries(true, false, null);
+ Iterator eiter = sb.wordIndex.loadedURL.entries(true, false, null);
indexURLEntry entry;
while (eiter.hasNext()) {
try {
@@ -966,11 +1029,11 @@ public final class yacy {
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
- ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
+ ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("eurl")) {
- Iterator eiter = pool.errorURL.entries(true, false, null);
+ Iterator eiter = sb.errorURL.entries(true, false, null);
plasmaCrawlEURL.Entry entry;
while (eiter.hasNext()) {
try {
@@ -985,11 +1048,11 @@ public final class yacy {
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
- ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
+ ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("nurl")) {
- Iterator eiter = pool.noticeURL.entries(true, false, null);
+ Iterator eiter = sb.noticeURL.entries(true, false, null);
plasmaCrawlNURL.Entry entry;
while (eiter.hasNext()) {
try {
@@ -1004,7 +1067,7 @@ public final class yacy {
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
- ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
+ ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
@@ -1048,7 +1111,7 @@ public final class yacy {
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf));
}
- pool.close();
+ sb.close();
} catch (IOException e) {
e.printStackTrace();
}
@@ -1057,12 +1120,12 @@ public final class yacy {
private static void urllist(String homePath, String source, boolean html, String targetName) {
File root = new File(homePath);
try {
- plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
+ final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
File file = new File(root, targetName);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
if (source.equals("lurl")) {
- Iterator eiter = pool.loadedURL.entries(true, false, null);
+ Iterator eiter = sb.wordIndex.loadedURL.entries(true, false, null);
indexURLEntry entry;
while (eiter.hasNext()) {
entry = (indexURLEntry) eiter.next();
@@ -1079,7 +1142,7 @@ public final class yacy {
}
}
if (source.equals("eurl")) {
- Iterator eiter = pool.errorURL.entries(true, false, null);
+ Iterator eiter = sb.errorURL.entries(true, false, null);
plasmaCrawlEURL.Entry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlEURL.Entry) eiter.next();
@@ -1095,7 +1158,7 @@ public final class yacy {
}
}
if (source.equals("nurl")) {
- Iterator eiter = pool.noticeURL.entries(true, false, null);
+ Iterator eiter = sb.noticeURL.entries(true, false, null);
plasmaCrawlNURL.Entry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlNURL.Entry) eiter.next();
@@ -1111,14 +1174,14 @@ public final class yacy {
}
}
bos.close();
- pool.close();
+ sb.close();
} catch (IOException e) {
e.printStackTrace();
}
}
- private static void migratelurls(File root, File urlHash) {
- plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
+ private static void migratelurls(String homePath, File urlHash) {
+ final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
kelondroTree oldindex = null;
try {
oldindex = new kelondroTree(urlHash, 1000, -1, indexURLEntryOld.rowdef);
@@ -1146,7 +1209,7 @@ public final class yacy {
if (oldrow != null) try {
oldentry = new indexURLEntryOld(oldrow, null);
comp = oldentry.comp();
- newentry = pool.loadedURL.newEntry(
+ newentry = sb.wordIndex.loadedURL.newEntry(
comp.url(),
comp.descr(),
"",
@@ -1163,7 +1226,7 @@ public final class yacy {
new kelondroBitfield(4),
oldentry.language(),
0, 0, 0, 0, 0, 0);
- pool.loadedURL.store(newentry);
+ sb.wordIndex.loadedURL.store(newentry);
c++;
} catch (IOException e) {
// ignore
@@ -1173,7 +1236,7 @@ public final class yacy {
last = System.currentTimeMillis();
}
}
- pool.close();
+ sb.close();
try { oldindex.close(); } catch (IOException e) { }
System.out.println("MIGRATION OF " + c + " URLs FINISHED");
}
@@ -1193,12 +1256,11 @@ public final class yacy {
*/
private static void urldbcleanup(String homePath) {
File root = new File(homePath);
- File plasmaroot = new File(root, "DATA/PLASMADB");
File indexroot = new File(root, "DATA/INDEX");
serverLog log = new serverLog("URLDBCLEANUP");
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
try {
- plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexroot, 4194304, 10000);
+ plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexroot, 4194304, 10000);
currentUrlDB.urldbcleanup();
currentUrlDB.close();
} catch (IOException e) {
@@ -1218,19 +1280,16 @@ public final class yacy {
try {
Iterator indexContainerIterator = null;
if (resource.equals("all")) {
- WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log);
- indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
- } else if (resource.equals("assortments")) {
- plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexRWIEntryOld.urlEntryRow, 16*1024*1024, 3000, log);
- indexContainerIterator = assortmentCluster.wordContainers(wordChunkStartHash, true, false);
- } /*else if (resource.startsWith("assortment")) {
+ WordIndex = new plasmaWordIndex(indexRoot, 8*1024*1024, 8*1024*1024, 3000, log);
+ indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
+ } else if (resource.startsWith("assortment")) {
int a = Integer.parseInt(resource.substring(10));
plasmaWordIndexAssortment assortment = new plasmaWordIndexAssortment(new File(homeDBroot, "ACLUSTER"), a, 8*1024*1024, 3000, null);
- indexContainerIterator = assortment.hashes(wordChunkStartHash, true, false);
+ indexContainerIterator = assortment.wordContainers(wordChunkStartHash, true, false);
} else if (resource.equals("words")) {
- plasmaWordIndexFileCluster fileDB = new plasmaWordIndexFileCluster(homeDBroot, log);
- indexContainerIterator = fileDB.wordContainers(wordChunkStartHash, true, false);
- }*/ // *** FIXME ***
+ plasmaWordIndexFileCluster fileDB = new plasmaWordIndexFileCluster(homeDBroot);
+ indexContainerIterator = fileDB.wordContainers(wordChunkStartHash, false);
+ }
int counter = 0;
indexContainer container = null;
if (format.equals("zip")) {
@@ -1269,7 +1328,7 @@ public final class yacy {
log.logSevere("IOException", e);
}
if (WordIndex != null) {
- WordIndex.close(60);
+ WordIndex.close();
WordIndex = null;
}
}
@@ -1354,10 +1413,15 @@ public final class yacy {
if (args.length == 2) applicationRoot= args[1];
shutdown(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratewords"))) {
- // migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
+ // migrate words from DATA/PLASMADB/WORDS path to collection index
// attention: this may run long and should not be interrupted!
if (args.length == 2) applicationRoot= args[1];
migrateWords(applicationRoot);
+ } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migrateassortments"))) {
+ // migrate assortments from DATA/PLASMADB/ACLUSTER path to collection index
+ // attention: this may run long and should not be interrupted!
+ if (args.length == 2) applicationRoot= args[1];
+ migrateAssortments(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) {
// migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
// attention: this may run long and should not be interrupted!
@@ -1437,7 +1501,7 @@ public final class yacy {
urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) {
File root = new File(applicationRoot);
- migratelurls(root, new File(root, "DATA/PLASMADB/urlHash.db"));
+ migratelurls(applicationRoot, new File(root, "DATA/PLASMADB/urlHash.db"));
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];