From 109ed0a0bb23982897b58117ed65dc579c616f11 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 5 Dec 2006 02:47:51 +0000 Subject: [PATCH] - cleaned up code; removed methods to write the old data structures - added an assortment importer. the old database structures can be imported with java -classpath classes yacy -migrateassortments - modified wordmigration. The indexes from WORDS are now imported to the collection database. The call is java -classpath classes yacy -migratewords (as it was) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3044 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- doc/Download.html | 10 +- htroot/Bookmarks.java | 2 +- htroot/IndexCleaner_p.java | 6 +- htroot/IndexControl_p.java | 27 +- htroot/IndexCreateIndexingQueue_p.java | 14 +- htroot/IndexCreateWWWGlobalQueue_p.java | 8 +- htroot/IndexCreateWWWLocalQueue_p.java | 16 +- htroot/IndexCreate_p.java | 16 +- htroot/IndexImport_p.java | 4 +- htroot/IndexMonitor.java | 22 +- htroot/IndexShare_p.java | 4 +- htroot/IndexTransfer_p.java | 2 +- htroot/PerformanceMemory_p.java | 34 +- htroot/PerformanceQueues_p.java | 2 +- htroot/QuickCrawlLink_p.java | 6 +- htroot/ViewFile.java | 2 +- htroot/htdocsdefault/dir.java | 8 +- htroot/xml/queues_p.java | 8 +- htroot/yacy/crawlOrder.java | 4 +- htroot/yacy/crawlReceipt.java | 18 +- htroot/yacy/query.java | 2 +- htroot/yacy/search.java | 4 +- htroot/yacy/transferRWI.java | 70 +-- htroot/yacy/transferURL.java | 10 +- htroot/yacysearch.java | 2 +- source/de/anomic/index/indexCachedRI.java | 21 +- source/de/anomic/index/indexCollectionRI.java | 20 +- source/de/anomic/index/indexContainer.java | 40 +- source/de/anomic/index/indexRAMRI.java | 45 +- source/de/anomic/index/indexRI.java | 6 +- source/de/anomic/index/indexRWIEntryNew.java | 3 +- .../plasma/crawler/AbstractCrawlWorker.java | 4 +- .../plasma/crawler/http/CrawlWorker.java | 2 +- .../plasma/dbImport/AbstractImporter.java | 19 +- .../plasma/dbImport/AssortmentImporter.java | 58 ++- .../plasma/dbImport/dbImportManager.java | 6 +- .../de/anomic/plasma/dbImport/dbImporter.java | 2 +- .../dbImport/plasmaCrawlNURLImporter.java | 13 +- .../plasma/dbImport/plasmaDbImporter.java | 52 +-- source/de/anomic/plasma/plasmaCrawlEURL.java | 7 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 6 +- .../de/anomic/plasma/plasmaCrawlStacker.java | 12 +- source/de/anomic/plasma/plasmaDHTChunk.java | 8 +- source/de/anomic/plasma/plasmaDHTFlush.java | 2 +- source/de/anomic/plasma/plasmaHTCache.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 166 +++---- source/de/anomic/plasma/plasmaURLPool.java | 99 ----- source/de/anomic/plasma/plasmaWordIndex.java | 194 +++------ .../plasma/plasmaWordIndexAssortment.java | 149 +------ .../plasmaWordIndexAssortmentCluster.java | 408 ------------------ .../de/anomic/plasma/plasmaWordIndexFile.java | 207 +-------- .../plasma/plasmaWordIndexFileCluster.java | 94 +--- .../anomic/urlRedirector/urlRedirectord.java | 6 +- source/de/anomic/yacy/yacyClient.java | 29 +- source/de/anomic/yacy/yacyDHTAction.java | 1 + source/de/anomic/yacy/yacyPeerActions.java | 4 +- source/yacy.java | 172 +++++--- 57 files changed, 601 insertions(+), 1557 deletions(-) delete mode 100644 source/de/anomic/plasma/plasmaURLPool.java delete mode 100644 source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java diff --git a/doc/Download.html b/doc/Download.html index 7e0a228b7..b4aea23c8 100644 --- a/doc/Download.html +++ b/doc/Download.html @@ -53,19 +53,19 @@ globalheader();

If you download the software, you must accept the License.

Latest Release: -The latest YaCy release version is 0.48
+The latest YaCy release version is 0.49
Nightly builds from compiles out of SVN can be obtained from http://latest.yacy-forum.net/.

diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 42a36bc6d..28f5fef1d 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -147,7 +147,7 @@ public class Bookmarks { bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - indexURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); + indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null); plasmaParserDocument document = null; if (urlentry != null) { indexURLEntry.Components comp = urlentry.comp(); diff --git a/htroot/IndexCleaner_p.java b/htroot/IndexCleaner_p.java index fc36ed171..6195bd13e 100755 --- a/htroot/IndexCleaner_p.java +++ b/htroot/IndexCleaner_p.java @@ -62,7 +62,7 @@ public class IndexCleaner_p { prop.put("bla", "post!=null"); if (post.get("action").equals("ustart")) { if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) { - urldbCleanerThread = sb.urlPool.loadedURL.makeCleaner(); + urldbCleanerThread = sb.wordIndex.loadedURL.makeCleaner(); urldbCleanerThread.start(); } else { @@ -77,7 +77,7 @@ public class IndexCleaner_p { } else if (post.get("action").equals("rstart")) { if (indexCleanerThread==null || !indexCleanerThread.isAlive()) { - indexCleanerThread = sb.wordIndex.makeCleaner(sb.urlPool.loadedURL, post.get("wordHash","--------")); + indexCleanerThread = sb.wordIndex.makeCleaner(sb.wordIndex.loadedURL, post.get("wordHash","--------")); indexCleanerThread.start(); } else { @@ -98,7 +98,7 @@ public class IndexCleaner_p { } if (urldbCleanerThread!=null) { prop.put("urldb", 1); - prop.put("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/sb.urlPool.loadedURL.size())*100 + ""); + prop.put("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/sb.wordIndex.loadedURL.size())*100 + ""); prop.put("urldb_blacklisted", urldbCleanerThread.blacklistedUrls); prop.put("urldb_total", urldbCleanerThread.totalSearchedUrls); prop.put("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl); diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 146c6bfdd..62758aafb 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -63,7 +63,6 @@ import de.anomic.index.indexURLEntry; import de.anomic.net.URL; import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCodings; import de.anomic.server.serverObjects; @@ -87,7 +86,7 @@ public class IndexControl_p { prop.put("urlhash", ""); prop.put("result", ""); prop.put("wcount", Integer.toString(switchboard.wordIndex.size())); - prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size())); + prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size())); prop.put("otherHosts", ""); prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : ""); prop.put("indexDistributeWhileCrawling", (switchboard.getConfig("allowDistributeIndexWhileCrawling", "true").equals("true")) ? "checked" : ""); @@ -170,7 +169,7 @@ public class IndexControl_p { } if (delurl || delurlref) { for (int i = 0; i < urlx.length; i++) { - switchboard.urlPool.loadedURL.remove(urlx[i]); + switchboard.wordIndex.loadedURL.remove(urlx[i]); } } switchboard.wordIndex.deleteContainer(keyhash); @@ -190,7 +189,7 @@ public class IndexControl_p { } if (delurl || delurlref) { for (int i = 0; i < urlx.length; i++) { - switchboard.urlPool.loadedURL.remove(urlx[i]); + switchboard.wordIndex.loadedURL.remove(urlx[i]); } } Set urlHashes = new HashSet(); @@ -217,13 +216,13 @@ public class IndexControl_p { } if (post.containsKey("urlhashdelete")) { - indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null); if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { urlstring = entry.comp().url().toNormalform(); prop.put("urlstring", ""); - switchboard.urlPool.loadedURL.remove(urlhash); + switchboard.wordIndex.loadedURL.remove(urlhash); prop.put("result", "Removed URL " + urlstring); } } @@ -282,7 +281,7 @@ public class IndexControl_p { indexURLEntry lurl; while (urlIter.hasNext()) { iEntry = (indexRWIEntry) urlIter.next(); - lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null); + lurl = switchboard.wordIndex.loadedURL.load(iEntry.urlHash(), null); if (lurl == null) { unknownURLEntries.add(iEntry.urlHash()); urlIter.remove(); @@ -307,7 +306,7 @@ public class IndexControl_p { // generate list if (post.containsKey("keyhashsimilar")) { - final Iterator containerIt = switchboard.wordIndex.indexContainerSet(keyhash, plasmaWordIndex.RL_WORDFILES, true, 256).iterator(); + final Iterator containerIt = switchboard.wordIndex.indexContainerSet(keyhash, false, true, 256).iterator(); indexContainer container; int i = 0; int rows = 0, cols = 0; @@ -333,7 +332,7 @@ public class IndexControl_p { URL url = new URL(urlstring); urlhash = plasmaURL.urlHash(url); prop.put("urlhash", urlhash); - indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null); if (entry == null) { prop.put("urlstring", "unknown url: " + urlstring); prop.put("urlhash", ""); @@ -347,7 +346,7 @@ public class IndexControl_p { } if (post.containsKey("urlhashsearch")) { - indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null); if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash); } else { @@ -359,7 +358,7 @@ public class IndexControl_p { // generate list if (post.containsKey("urlhashsimilar")) { try { - final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash); + final Iterator entryIt = switchboard.wordIndex.loadedURL.entries(true, true, urlhash); StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:
"); indexURLEntry entry; int i = 0; @@ -403,7 +402,7 @@ public class IndexControl_p { // insert constants prop.put("wcount", Integer.toString(switchboard.wordIndex.size())); - prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size())); + prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size())); prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : ""); prop.put("indexDistributeWhileCrawling", (switchboard.getConfig("allowDistributeIndexWhileCrawling", "true").equals("true")) ? "checked" : ""); prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : ""); @@ -422,7 +421,7 @@ public class IndexControl_p { } indexURLEntry.Components comp = entry.comp(); String referrer = null; - indexURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); + indexURLEntry le = switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null); if (le == null) { referrer = ""; } else { @@ -471,7 +470,7 @@ public class IndexControl_p { while (en.hasNext()) { xi = (indexRWIEntry) en.next(); uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())}; - indexURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null); + indexURLEntry le = switchboard.wordIndex.loadedURL.load(uh[0], null); if (le == null) { tm.put(uh[0], uh); } else { diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index 32e66590e..35d1eabbc 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -76,7 +76,7 @@ public class IndexCreateIndexingQueue_p { } if (post.containsKey("clearRejected")) { - switchboard.urlPool.errorURL.clearStack(); + switchboard.errorURL.clearStack(); } if (post.containsKey("moreRejected")) { showRejectedCount = Integer.parseInt(post.get("showRejected", "10")); @@ -172,11 +172,11 @@ public class IndexCreateIndexingQueue_p { } // failure cases - if (switchboard.urlPool.errorURL.stackSize() != 0) { - if (showRejectedCount > switchboard.urlPool.errorURL.stackSize()) showRejectedCount = switchboard.urlPool.errorURL.stackSize(); + if (switchboard.errorURL.stackSize() != 0) { + if (showRejectedCount > switchboard.errorURL.stackSize()) showRejectedCount = switchboard.errorURL.stackSize(); prop.put("rejected", 1); - prop.put("rejected_num", switchboard.urlPool.errorURL.stackSize()); - if (showRejectedCount != switchboard.urlPool.errorURL.stackSize()) { + prop.put("rejected_num", switchboard.errorURL.stackSize()); + if (showRejectedCount != switchboard.errorURL.stackSize()) { prop.put("rejected_only-latest", 1); prop.put("rejected_only-latest_num", showRejectedCount); prop.put("rejected_only-latest_newnum", ((int) (showRejectedCount * 1.5))); @@ -189,9 +189,9 @@ public class IndexCreateIndexingQueue_p { plasmaCrawlEURL.Entry entry; yacySeed initiatorSeed, executorSeed; int j=0; - for (int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) { + for (int i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) { try { - entry = switchboard.urlPool.errorURL.stackPopEntry(i); + entry = switchboard.errorURL.stackPopEntry(i); url = entry.url(); if (url == null) continue; diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index 10cea766f..3c8767480 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -79,8 +79,8 @@ public class IndexCreateWWWGlobalQueue_p { } if (post.containsKey("clearcrawlqueue")) { - int c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); - switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT); + int c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); + switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT); try { switchboard.cleanProfiles(); } catch (InterruptedException e) { /* Ignore this */} /* int c = 0; @@ -94,12 +94,12 @@ public class IndexCreateWWWGlobalQueue_p { } } - int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); + int stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); if (stackSize == 0) { prop.put("crawler-queue", 0); } else { prop.put("crawler-queue", 1); - plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit); + plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit); prop.put("crawler-queue_num", stackSize);//num Entries plasmaCrawlNURL.Entry urle; boolean dark = true; diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 137ce33c4..5d5042949 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -90,8 +90,8 @@ public class IndexCreateWWWLocalQueue_p { String pattern = post.get("pattern", ".*").trim(); String option = post.get("option", ".*").trim(); if (pattern.equals(".*")) { - c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); - switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE); + c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); + switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE); try { switchboard.cleanProfiles(); } catch (InterruptedException e) {/* ignore this */} } else{ Pattern compiledPattern = null; @@ -100,13 +100,13 @@ public class IndexCreateWWWLocalQueue_p { compiledPattern = Pattern.compile(pattern); // iterating through the list of URLs - Iterator iter = switchboard.urlPool.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); + Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); while (iter.hasNext()) { String value = null; String nextHash = new String((byte[]) iter.next()); Entry entry = null; try { - entry = switchboard.urlPool.noticeURL.getEntry(nextHash); + entry = switchboard.noticeURL.getEntry(nextHash); } catch (IOException e) { continue; } @@ -137,7 +137,7 @@ public class IndexCreateWWWLocalQueue_p { if (value != null) { Matcher matcher = compiledPattern.matcher(value); if (matcher.find()) { - switchboard.urlPool.noticeURL.remove(nextHash); + switchboard.noticeURL.remove(nextHash); } } @@ -151,18 +151,18 @@ public class IndexCreateWWWLocalQueue_p { prop.put("info_numEntries", c); } else if (post.containsKey("deleteEntry")) { String urlHash = (String) post.get("deleteEntry"); - switchboard.urlPool.noticeURL.remove(urlHash); + switchboard.noticeURL.remove(urlHash); prop.put("LOCATION",""); return prop; } } - int showNum = 0, stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); + int showNum = 0, stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); if (stackSize == 0) { prop.put("crawler-queue", 0); } else { prop.put("crawler-queue", 1); - plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); + plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); plasmaCrawlNURL.Entry urle; boolean dark = true; diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index b737083a2..ee5bbda4f 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -168,9 +168,9 @@ public class IndexCreate_p { // stack request // first delete old entry, if exists String urlhash = plasmaURL.urlHash(crawlingStart); - switchboard.urlPool.loadedURL.remove(urlhash); - switchboard.urlPool.noticeURL.remove(urlhash); - switchboard.urlPool.errorURL.remove(urlhash); + switchboard.wordIndex.loadedURL.remove(urlhash); + switchboard.noticeURL.remove(urlhash); + switchboard.errorURL.remove(urlhash); // stack url plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw); @@ -203,10 +203,10 @@ public class IndexCreate_p { prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL")))); prop.put("error_reasonString", reasonString); - plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, + plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, crawlingStartURL.getHost(), reasonString, new kelondroBitfield()); ee.store(); - switchboard.urlPool.errorURL.stackPushEntry(ee); + switchboard.errorURL.stackPushEntry(ee); } } catch (PatternSyntaxException e) { prop.put("error", 8); //crawlfilter does not match url @@ -281,10 +281,10 @@ public class IndexCreate_p { if (rejectReason == null) { c++; } else { - plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, + plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, (String) e.getValue(), rejectReason, new kelondroBitfield()); ee.store(); - switchboard.urlPool.errorURL.stackPushEntry(ee); + switchboard.errorURL.stackPushEntry(ee); } } @@ -412,7 +412,7 @@ public class IndexCreate_p { int queueStackSize = switchboard.sbQueue.size(); int loaderThreadsSize = switchboard.cacheLoader.size(); - int crawlerListSize = switchboard.urlPool.noticeURL.stackSize(); + int crawlerListSize = switchboard.noticeURL.stackSize(); int completequeue = queueStackSize + loaderThreadsSize + crawlerListSize; if ((completequeue > 0) || ((post != null) && (post.containsKey("refreshpage")))) { diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 25bb87950..1045675ce 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -98,7 +98,7 @@ public final class IndexImport_p { if (startImport) { dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType); if (importerThread != null) { - importerThread.init(new File(importPath), switchboard.indexPath, cacheSize, 100); + importerThread.init(new File(importPath), cacheSize, 100); importerThread.startIt(); } prop.put("LOCATION",""); @@ -147,7 +147,7 @@ public final class IndexImport_p { } prop.put("wcount", Integer.toString(switchboard.wordIndex.size())); - prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size())); + prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size())); /* * Loop over all currently running jobs diff --git a/htroot/IndexMonitor.java b/htroot/IndexMonitor.java index 97568020e..24dd5c6d1 100644 --- a/htroot/IndexMonitor.java +++ b/htroot/IndexMonitor.java @@ -109,12 +109,12 @@ public class IndexMonitor { } // do the commands - if (post.containsKey("clearlist")) sb.urlPool.loadedURL.clearStack(tabletype); + if (post.containsKey("clearlist")) sb.wordIndex.loadedURL.clearStack(tabletype); if (post.containsKey("deleteentry")) { String hash = post.get("hash", null); if (hash != null) { // delete from database - sb.urlPool.loadedURL.remove(hash); + sb.wordIndex.loadedURL.remove(hash); } } if (post.containsKey("moreIndexed")) { @@ -126,18 +126,18 @@ public class IndexMonitor { // create table if (tabletype == 0) { prop.put("table", 2); - } else if (sb.urlPool.loadedURL.getStackSize(tabletype) == 0) { + } else if (sb.wordIndex.loadedURL.getStackSize(tabletype) == 0) { prop.put("table", 0); } else { prop.put("table", 1); - if (lines > sb.urlPool.loadedURL.getStackSize(tabletype)) lines = sb.urlPool.loadedURL.getStackSize(tabletype); - if (lines == sb.urlPool.loadedURL.getStackSize(tabletype)) { + if (lines > sb.wordIndex.loadedURL.getStackSize(tabletype)) lines = sb.wordIndex.loadedURL.getStackSize(tabletype); + if (lines == sb.wordIndex.loadedURL.getStackSize(tabletype)) { prop.put("table_size", 0); } else { prop.put("table_size", 1); prop.put("table_size_count", lines); } - prop.put("table_size_all", sb.urlPool.loadedURL.getStackSize(tabletype)); + prop.put("table_size_all", sb.wordIndex.loadedURL.getStackSize(tabletype)); prop.put("table_feedbackpage", "IndexMonitor.html"); prop.put("table_tabletype", tabletype); prop.put("table_showInit", (showInit) ? 1 : 0); @@ -153,14 +153,14 @@ public class IndexMonitor { final plasmaHTCache cacheManager = sb.getCacheManager(); int i, cnt = 0; - for (i = sb.urlPool.loadedURL.getStackSize(tabletype) - 1; i >= (sb.urlPool.loadedURL.getStackSize(tabletype) - lines); i--) { - initiatorHash = sb.urlPool.loadedURL.getInitiatorHash(tabletype, i); - executorHash = sb.urlPool.loadedURL.getExecutorHash(tabletype, i); + for (i = sb.wordIndex.loadedURL.getStackSize(tabletype) - 1; i >= (sb.wordIndex.loadedURL.getStackSize(tabletype) - lines); i--) { + initiatorHash = sb.wordIndex.loadedURL.getInitiatorHash(tabletype, i); + executorHash = sb.wordIndex.loadedURL.getExecutorHash(tabletype, i); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash); - urlHash = sb.urlPool.loadedURL.getUrlHash(tabletype, i); + urlHash = sb.wordIndex.loadedURL.getUrlHash(tabletype, i); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); try { - urle = sb.urlPool.loadedURL.load(urlHash, null); + urle = sb.wordIndex.loadedURL.load(urlHash, null); indexURLEntry.Components comp = urle.comp(); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java index b6f362e42..a174beb45 100644 --- a/htroot/IndexShare_p.java +++ b/htroot/IndexShare_p.java @@ -66,7 +66,7 @@ public class IndexShare_p { prop.put("dtable", ""); prop.put("rtable", ""); prop.put("wcount", Integer.toString(switchboard.wordIndex.size())); - prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size())); + prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size())); return prop; // be save } @@ -79,7 +79,7 @@ public class IndexShare_p { // insert constants prop.put("wcount", Integer.toString(switchboard.wordIndex.size())); - prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size())); + prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size())); // return rewrite properties return prop; } diff --git a/htroot/IndexTransfer_p.java b/htroot/IndexTransfer_p.java index 1c71c091e..f6ac9952a 100644 --- a/htroot/IndexTransfer_p.java +++ b/htroot/IndexTransfer_p.java @@ -96,7 +96,7 @@ public final class IndexTransfer_p { // insert constants prop.put("wcount", Integer.toString(switchboard.wordIndex.size())); - prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size())); + prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size())); prop.put("running",(switchboard.transferIdxThread==null)?0:1); if (switchboard.transferIdxThread != null) { String[] status = switchboard.transferIdxThread.getStatus(); diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java index 591a67e3a..ffafb1ab2 100644 --- a/htroot/PerformanceMemory_p.java +++ b/htroot/PerformanceMemory_p.java @@ -175,11 +175,11 @@ public class PerformanceMemory_p { ost = sb.cacheManager.cacheObjectStatus(); putprop(prop, env, "", "HTTP", set); - req = sb.urlPool.loadedURL.size(); - chk = sb.urlPool.loadedURL.cacheNodeChunkSize(); - obj = sb.urlPool.loadedURL.cacheObjectChunkSize(); - slt = sb.urlPool.loadedURL.cacheNodeStatus(); - ost = sb.urlPool.loadedURL.cacheObjectStatus(); + req = sb.wordIndex.loadedURL.size(); + chk = sb.wordIndex.loadedURL.cacheNodeChunkSize(); + obj = sb.wordIndex.loadedURL.cacheObjectChunkSize(); + slt = sb.wordIndex.loadedURL.cacheNodeStatus(); + ost = sb.wordIndex.loadedURL.cacheObjectStatus(); putprop(prop, env, "", "LURL", set); if (sb.sbStackCrawlThread.getDBType() != de.anomic.plasma.plasmaCrawlStacker.QUEUE_DB_TYPE_TREE) { @@ -194,27 +194,27 @@ public class PerformanceMemory_p { putprop(prop, env, "usePreNURLCache", "PreNURL", set); } - if (sb.urlPool.noticeURL.getUseNewDB()) { + if (sb.noticeURL.getUseNewDB()) { prop.put("useNURLCache", 0); } else { prop.put("useNURLCache", 1); - req = sb.urlPool.noticeURL.size(); - chk = sb.urlPool.noticeURL.cacheNodeChunkSize(); - obj = sb.urlPool.noticeURL.cacheObjectChunkSize(); - slt = sb.urlPool.noticeURL.cacheNodeStatus(); - ost = sb.urlPool.noticeURL.cacheObjectStatus(); + req = sb.noticeURL.size(); + chk = sb.noticeURL.cacheNodeChunkSize(); + obj = sb.noticeURL.cacheObjectChunkSize(); + slt = sb.noticeURL.cacheNodeStatus(); + ost = sb.noticeURL.cacheObjectStatus(); putprop(prop, env, "useNURLCache", "NURL", set); } - if (sb.urlPool.errorURL.getUseNewDB()) { + if (sb.errorURL.getUseNewDB()) { prop.put("useEURLCache", 0); } else { prop.put("useEURLCache", 1); - req = sb.urlPool.errorURL.size(); - chk = sb.urlPool.errorURL.cacheNodeChunkSize(); - obj = sb.urlPool.errorURL.cacheObjectChunkSize(); - slt = sb.urlPool.errorURL.cacheNodeStatus(); - ost = sb.urlPool.errorURL.cacheObjectStatus(); + req = sb.errorURL.size(); + chk = sb.errorURL.cacheNodeChunkSize(); + obj = sb.errorURL.cacheObjectChunkSize(); + slt = sb.errorURL.cacheNodeStatus(); + ost = sb.errorURL.cacheObjectStatus(); putprop(prop, env, "useEURLCache", "EURL", set); } diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index 6f3cafc30..1df1e0e27 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -262,7 +262,7 @@ public class PerformanceQueues_p { } // table cache settings - prop.put("urlCacheSize", switchboard.urlPool.loadedURL.writeCacheSize()); + prop.put("urlCacheSize", switchboard.wordIndex.loadedURL.writeCacheSize()); prop.put("wordCacheWSize", switchboard.wordIndex.dhtOutCacheSize()); prop.put("wordCacheKSize", switchboard.wordIndex.dhtInCacheSize()); prop.put("maxURLinWCache", "" + switchboard.wordIndex.maxURLinDHTOutCache()); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 27d205924..dff5eef4d 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -149,9 +149,9 @@ public class QuickCrawlLink_p { } String urlhash = plasmaURL.urlHash(crawlingStart); - switchboard.urlPool.loadedURL.remove(urlhash); - switchboard.urlPool.noticeURL.remove(urlhash); - switchboard.urlPool.errorURL.remove(urlhash); + switchboard.wordIndex.loadedURL.remove(urlhash); + switchboard.noticeURL.remove(urlhash); + switchboard.errorURL.remove(urlhash); // create crawling profile plasmaCrawlProfile.entry pe = null; diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 8af387d82..ad20fdc50 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -108,7 +108,7 @@ public class ViewFile { if (urlHash.length() > 0) { // getting the urlEntry that belongs to the url hash indexURLEntry urlEntry = null; - urlEntry = sb.urlPool.loadedURL.load(urlHash, null); + urlEntry = sb.wordIndex.loadedURL.load(urlHash, null); if (urlEntry == null) { prop.put("error",2); prop.put("viewMode",VIEW_MODE_NO_TEXT); diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index 3d94c6de4..5b6bfdab1 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -361,7 +361,7 @@ public class dir { try { final URL url = new URL(urlstring); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()), "UTF-8"); - final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry( + final indexURLEntry newEntry = switchboard.wordIndex.loadedURL.newEntry( url, "YaCyShare: " + descr, yacyCore.seedDB.mySeed.getName(), @@ -379,8 +379,8 @@ public class dir { "**", // language 0,0,0,0,0,0 ); - switchboard.urlPool.loadedURL.store(newEntry); - switchboard.urlPool.loadedURL.stack( + switchboard.wordIndex.loadedURL.store(newEntry); + switchboard.wordIndex.loadedURL.stack( newEntry, "____________", /*initiator*/ yacyCore.seedDB.mySeed.hash, /*executor*/ @@ -401,7 +401,7 @@ public class dir { entry = (Map.Entry) words.next(); switchboard.wordIndex.removeEntry(plasmaCondenser.word2hash((String) entry.getKey()), urlhash, true); } - switchboard.urlPool.loadedURL.remove(urlhash); + switchboard.wordIndex.loadedURL.remove(urlhash); } catch (Exception e) { serverLog.logSevere("DIR", "INTERNAL ERROR in dir.deletePhrase", e); } diff --git a/htroot/xml/queues_p.java b/htroot/xml/queues_p.java index dadd71ab2..1ab477abb 100644 --- a/htroot/xml/queues_p.java +++ b/htroot/xml/queues_p.java @@ -164,17 +164,17 @@ public class queues_p { //local crawl queue prop.put("localCrawlSize", Integer.toString(switchboard.getThread("50_localcrawl").getJobCount())); - int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); - addNTable(prop, "list-local", switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize))); + int stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); + addNTable(prop, "list-local", switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize))); //global crawl queue prop.put("remoteCrawlSize", Integer.toString(switchboard.getThread("61_globalcrawltrigger").getJobCount())); //prop.put("remoteCrawlSize", Integer.toString(switchboard.getThread("62_remotetriggeredcrawl").getJobCount())); - stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); + stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); if (stackSize == 0) { prop.put("list-remote", 0); } else { - addNTable(prop, "list-remote", switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, Math.min(10, stackSize))); + addNTable(prop, "list-remote", switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, Math.min(10, stackSize))); } // return rewrite properties diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index c9add44ae..ae6a17bfa 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -249,13 +249,13 @@ public final class crawlOrder { // case where we have already the url loaded; reason = reasonString; // send lurl-Entry as response - indexURLEntry entry = switchboard.urlPool.loadedURL.load(plasmaURL.urlHash(url), null); + indexURLEntry entry = switchboard.wordIndex.loadedURL.load(plasmaURL.urlHash(url), null); if (entry == null) { response = "rejected"; lurl = ""; } else { response = "double"; - switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare); + switchboard.wordIndex.loadedURL.notifyGCrawl(entry.hash(), iam, youare); lurl = crypt.simpleEncode(entry.toString()); } } else { diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index bc8cc87bc..c4e90e520 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -124,7 +124,7 @@ public final class crawlReceipt { prop.put("delay", "3600"); } else if (result.equals("fill")) { // generating a new loaded URL entry - indexURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr); + indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr); if (entry == null) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam + "\n\tURL properties: "+ propStr); @@ -135,16 +135,16 @@ public final class crawlReceipt { "\n\tURL properties: "+ propStr); } else try { // put new entry into database - switchboard.urlPool.loadedURL.store(entry); - switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1); + switchboard.wordIndex.loadedURL.store(entry); + switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1); // generating url hash String newUrlHash = plasmaURL.urlHash(comp.url()); String oldUrlHash = plasmaURL.oldurlHash(comp.url()); // removing URL from notice URL - switchboard.urlPool.noticeURL.remove(newUrlHash); - switchboard.urlPool.noticeURL.remove(oldUrlHash); + switchboard.noticeURL.remove(newUrlHash); + switchboard.noticeURL.remove(oldUrlHash); log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform()); } catch (IOException e) { @@ -155,11 +155,11 @@ public final class crawlReceipt { prop.put("delay", "10"); } else { try { - plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash); - plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield()); + plasmaCrawlNURL.Entry en = switchboard.noticeURL.getEntry(receivedUrlhash); + plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield()); ee.store(); - switchboard.urlPool.errorURL.stackPushEntry(ee); - switchboard.urlPool.noticeURL.remove(receivedUrlhash); + switchboard.errorURL.stackPushEntry(ee); + switchboard.noticeURL.remove(receivedUrlhash); } catch (IOException e) { } diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java index 1c7560117..24c33da18 100644 --- a/htroot/yacy/query.java +++ b/htroot/yacy/query.java @@ -98,7 +98,7 @@ public final class query { if (obj.equals("lurlcount")) { // return the number of all available l-url's - prop.put("response", sb.urlPool.loadedURL.size()); + prop.put("response", sb.wordIndex.loadedURL.size()); return prop; } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 96f929166..96ca414c7 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -144,7 +144,7 @@ public final class search { plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults); plasmaSearchTimingProfile remoteTiming = null; - plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache); + plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.wordIndex.loadedURL, sb.snippetCache); Map containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls)); if (containers != null) { Iterator ci = containers.entrySet().iterator(); @@ -173,7 +173,7 @@ public final class search { plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, - yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, + yacyCore.log, sb.wordIndex, sb.wordIndex.loadedURL, sb.snippetCache); Map containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls)); diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 80c8be09a..f9152a133 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -53,7 +53,6 @@ import java.util.List; import de.anomic.http.httpHeader; import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRWIEntryNew; -import de.anomic.index.indexRWIEntryOld; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCore; @@ -93,6 +92,7 @@ public final class transferRWI { StringBuffer unknownURLs = new StringBuffer(); int pause = 0; + /* boolean shortCacheFlush = false; if ((granted) && (sb.wordIndex.busyCacheFlush)) { // wait a little bit, maybe we got into a short flush slot @@ -101,9 +101,10 @@ public final class transferRWI { shortCacheFlush = true; break; } - try {Thread.sleep(100);} catch (InterruptedException e) {/* */} + try {Thread.sleep(100);} catch (InterruptedException e) {} } } + */ if (!granted) { // we dont want to receive indexes @@ -152,42 +153,45 @@ public final class transferRWI { Iterator i = v.iterator(); while (i.hasNext()) { serverCore.checkInterruption(); - estring = (String) i.next(); + + // check if RWI entry is well-formed p = estring.indexOf("{"); - if (p > 0) { - wordHash = estring.substring(0, p); - wordhashes[received] = wordHash; - if (estring.indexOf("x=") > 0) - iEntry = new indexRWIEntryNew(estring.substring(p)); - else - iEntry = new indexRWIEntryOld(estring.substring(p)); - urlHash = iEntry.urlHash(); - if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) { - int deleted = sb.wordIndex.tryRemoveURLs(urlHash); - yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); - blocked++; - } else { - sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true); - serverCore.checkInterruption(); + if ((p < 0) || (estring.indexOf("x=") < 0)) { + blocked++; + continue; + } + wordHash = estring.substring(0, p); + wordhashes[received] = wordHash; + iEntry = new indexRWIEntryNew(estring.substring(p)); + urlHash = iEntry.urlHash(); + + // block blacklisted entries + if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) { + int deleted = sb.wordIndex.tryRemoveURLs(urlHash); + yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); + blocked++; + continue; + } + + // learn entry + sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true); + serverCore.checkInterruption(); - if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) { - try { - if (sb.urlPool.loadedURL.exists(urlHash)) { - knownURL.add(urlHash); - } else { - unknownURL.add(urlHash); - } - } catch (Exception ex) { - sb.getLog().logWarning( - "transferRWI: DB-Error while trying to determine if URL with hash '" + - urlHash + "' is known.", ex); - } - receivedURL++; - } - received++; + // check if we need to ask for the corresponding URL + if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) try { + if (sb.wordIndex.loadedURL.exists(urlHash)) { + knownURL.add(urlHash); + } else { + unknownURL.add(urlHash); } + receivedURL++; + } catch (Exception ex) { + sb.getLog().logWarning( + "transferRWI: DB-Error while trying to determine if URL with hash '" + + urlHash + "' is known.", ex); } + received++; } yacyCore.seedDB.mySeed.incRI(received); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 6984bf679..bfcdbce05 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -87,7 +87,7 @@ public final class transferURL { if (granted) { int received = 0; int blocked = 0; - final int sizeBefore = sb.urlPool.loadedURL.size(); + final int sizeBefore = sb.wordIndex.loadedURL.size(); // read the urls from the other properties and store String urls; indexURLEntry lEntry; @@ -97,7 +97,7 @@ public final class transferURL { if (urls == null) { yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName); } else { - lEntry = sb.urlPool.loadedURL.newEntry(urls); + lEntry = sb.wordIndex.loadedURL.newEntry(urls); if (lEntry == null) { yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); // TODO: should we send back an error message??? @@ -113,8 +113,8 @@ public final class transferURL { lEntry = null; blocked++; } else try { - sb.urlPool.loadedURL.store(lEntry); - sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3); + sb.wordIndex.loadedURL.store(lEntry); + sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3); yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName); received++; } catch (IOException e) { @@ -128,7 +128,7 @@ public final class transferURL { yacyCore.seedDB.mySeed.incRU(received); // return rewrite properties - final int more = sb.urlPool.loadedURL.size() - sizeBefore; + final int more = sb.wordIndex.loadedURL.size() - sizeBefore; doublevalues = Integer.toString(received - more); sb.getLog().logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms, Blocked " + blocked + " URLs"); if ((received - more) > 0) sb.getLog().logSevere("Received " + doublevalues + " double URLs from peer " + otherPeerName); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 308f4de05..33b30ff4a 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -201,7 +201,7 @@ public class yacysearch { return prop; } final String recommendHash = post.get("recommendref", ""); // urlhash - indexURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); + indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null); if (urlentry != null) { indexURLEntry.Components comp = urlentry.comp(); plasmaParserDocument document; diff --git a/source/de/anomic/index/indexCachedRI.java b/source/de/anomic/index/indexCachedRI.java index 6acb9b148..40bc468a3 100644 --- a/source/de/anomic/index/indexCachedRI.java +++ b/source/de/anomic/index/indexCachedRI.java @@ -86,7 +86,7 @@ public class indexCachedRI implements indexRI { return entries.updated(); } - public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) { + public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) { // add the entry if (intern) { riIntern.addEntry(wordHash, entry, updateTime, true); @@ -94,10 +94,9 @@ public class indexCachedRI implements indexRI { riExtern.addEntry(wordHash, entry, updateTime, false); flushControl(); } - return null; } - public indexContainer addEntries(indexContainer entries, long updateTime, boolean intern) { + public void addEntries(indexContainer entries, long updateTime, boolean intern) { // add the entry if (intern) { riIntern.addEntries(entries, updateTime, true); @@ -105,7 +104,6 @@ public class indexCachedRI implements indexRI { riExtern.addEntries(entries, updateTime, false); flushControl(); } - return null; } public void flushCacheSome(boolean busy) { @@ -133,12 +131,7 @@ public class indexCachedRI implements indexRI { // flush the wordHash indexContainer c = ram.deleteContainer(wordHash); - if (c != null) { - indexContainer feedback = backend.addEntries(c, c.updated(), false); - if (feedback != null) { - throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString()); - } - } + if (c != null) backend.addEntries(c, c.updated(), false); // pause to next loop to give other processes a chance to use IO //try {this.wait(8);} catch (InterruptedException e) {} @@ -206,11 +199,11 @@ public class indexCachedRI implements indexRI { return size; } - public void close(int waitingBoundSeconds) { + public void close() { synchronized (this) { - riIntern.close(waitingBoundSeconds); - riExtern.close(waitingBoundSeconds); - backend.close(-1); + riIntern.close(); + riExtern.close(); + backend.close(); } } diff --git a/source/de/anomic/index/indexCollectionRI.java b/source/de/anomic/index/indexCollectionRI.java index ca0bdd3d7..116139a59 100644 --- a/source/de/anomic/index/indexCollectionRI.java +++ b/source/de/anomic/index/indexCollectionRI.java @@ -104,7 +104,7 @@ public class indexCollectionRI implements indexRI { byte[] key = (byte[]) oo[0]; kelondroRowSet collection = (kelondroRowSet) oo[1]; if (collection == null) return null; - return new indexContainer(new String(key), collection, true); + return new indexContainer(new String(key), collection); } public void remove() { @@ -118,7 +118,7 @@ public class indexCollectionRI implements indexRI { kelondroRowSet collection = collectionIndex.get(wordHash.getBytes(), deleteIfEmpty); if (collection != null) collection.select(urlselection); if ((collection == null) || (collection.size() == 0)) return null; - return new indexContainer(wordHash, collection, true); + return new indexContainer(wordHash, collection); } catch (IOException e) { return null; } @@ -128,7 +128,7 @@ public class indexCollectionRI implements indexRI { try { kelondroRowSet collection = collectionIndex.delete(wordHash.getBytes()); if (collection == null) return null; - return new indexContainer(wordHash, collection, true); + return new indexContainer(wordHash, collection); } catch (IOException e) { return null; } @@ -152,26 +152,24 @@ public class indexCollectionRI implements indexRI { } } - public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { - indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow(), true); + public synchronized void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { + indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow()); container.add(newEntry); - return addEntries(container, updateTime, dhtCase); + addEntries(container, updateTime, dhtCase); } - public synchronized indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase) { + public synchronized void addEntries(indexContainer newEntries, long creationTime, boolean dhtCase) { String wordHash = newEntries.getWordHash(); try { collectionIndex.merge(wordHash.getBytes(), (kelondroRowCollection) newEntries); - return null; // merge does allways 'eat' up all entries unlike the assortments; they may return an overflow container } catch (kelondroOutOfLimitsException e) { e.printStackTrace(); - return null; } catch (IOException e) { - return null; + e.printStackTrace(); } } - public synchronized void close(int waitingSeconds) { + public synchronized void close() { try { collectionIndex.close(); } catch (IOException e) { diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index 68085ba4e..0717eb431 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -41,33 +41,24 @@ import de.anomic.kelondro.kelondroRowSet; public class indexContainer extends kelondroRowSet { private String wordHash; - private boolean newRWI; - public indexContainer(String wordHash, kelondroRow rowdef, int objectCount, byte[] cache, boolean newRWI) { - super(rowdef, objectCount, cache, kelondroBase64Order.enhancedCoder, 0, 0); - this.wordHash = wordHash; - this.newRWI = newRWI; - } - - public indexContainer(String wordHash, kelondroRow rowdef, boolean newRWI) { - this(wordHash, rowdef, kelondroBase64Order.enhancedCoder, 0, newRWI); + public indexContainer(String wordHash, kelondroRow rowdef) { + this(wordHash, rowdef, kelondroBase64Order.enhancedCoder, 0); } - public indexContainer(String wordHash, kelondroRowSet collection, boolean newRWI) { + public indexContainer(String wordHash, kelondroRowSet collection) { super(collection); this.wordHash = wordHash; - this.newRWI = newRWI; } - public indexContainer(String wordHash, kelondroRow rowdef, kelondroOrder ordering, int column, boolean newRWI) { + public indexContainer(String wordHash, kelondroRow rowdef, kelondroOrder ordering, int column) { super(rowdef, ordering, column, 0); this.wordHash = wordHash; this.lastTimeWrote = 0; - this.newRWI = newRWI; } public indexContainer topLevelClone() { - indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.sortOrder, this.sortColumn, this.newRWI); + indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.sortOrder, this.sortColumn); newContainer.add(this, -1); return newContainer; } @@ -133,7 +124,7 @@ public class indexContainer extends kelondroRowSet { if (entry instanceof indexRWIEntryNew) oldEntry = new indexRWIEntryNew(oldEntryRow); else - oldEntry = new indexRWIEntryOld(oldEntryRow); // FIXME: see if cloning is necessary + oldEntry = new indexRWIEntryNew(new indexRWIEntryOld(oldEntryRow)); if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container this.put(oldEntry.toKelondroEntry()); // put it back return false; @@ -146,19 +137,13 @@ public class indexContainer extends kelondroRowSet { public indexRWIEntry get(String urlHash) { kelondroRow.Entry entry = this.get(urlHash.getBytes()); if (entry == null) return null; - if (this.newRWI) - return new indexRWIEntryNew(entry); - else - return new indexRWIEntryOld(entry); + return new indexRWIEntryNew(entry); } public indexRWIEntry remove(String urlHash) { kelondroRow.Entry entry = this.remove(urlHash.getBytes()); if (entry == null) return null; - if (this.newRWI) - return new indexRWIEntryNew(entry); - else - return new indexRWIEntryOld(entry); + return new indexRWIEntryNew(entry); } public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { @@ -194,10 +179,7 @@ public class indexContainer extends kelondroRowSet { public Object next() { kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next(); if (rentry == null) return null; - if (newRWI) - return new indexRWIEntryNew(rentry); - else - return new indexRWIEntryOld(rentry); + return new indexRWIEntryNew(rentry); } public void remove() { @@ -307,7 +289,7 @@ public class indexContainer extends kelondroRowSet { assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString(); int keylength = small.rowdef.width(0); assert (keylength == large.rowdef.width(0)); - indexContainer conj = new indexContainer(null, small.rowdef, small.newRWI); // start with empty search result + indexContainer conj = new indexContainer(null, small.rowdef); // start with empty search result Iterator se = small.entries(); indexRWIEntry ie0, ie1; long stamp = System.currentTimeMillis(); @@ -330,7 +312,7 @@ public class indexContainer extends kelondroRowSet { assert i1.rowdef.equals(i2.rowdef) : "i1 = " + i1.rowdef.toString() + "; i2 = " + i2.rowdef.toString(); int keylength = i1.rowdef.width(0); assert (keylength == i2.rowdef.width(0)); - indexContainer conj = new indexContainer(null, i1.rowdef, i1.newRWI); // start with empty search result + indexContainer conj = new indexContainer(null, i1.rowdef); // start with empty search result if (!((i1.order().signature().equals(i2.order().signature())) && (i1.primarykey() == i2.primarykey()))) return conj; // ordering must be equal Iterator e1 = i1.entries(); diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index a80a8d18a..8d20646dc 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -58,7 +58,6 @@ public final class indexRAMRI implements indexRI { private String indexArrayFileName; private kelondroRow payloadrow; private kelondroRow bufferStructureBasis; - private boolean newRWI; // calculated constants private static String maxKey; @@ -67,7 +66,7 @@ public final class indexRAMRI implements indexRI { //minKey = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; } - public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceLimitInit, String dumpname, serverLog log, boolean newRWI) { + public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceLimitInit, String dumpname, serverLog log) { // creates a new index cache // the cache has a back-end where indexes that do not fit in the cache are flushed @@ -79,7 +78,6 @@ public final class indexRAMRI implements indexRI { this.cacheMaxCount = 10000; this.cacheReferenceLimit = wCacheReferenceLimitInit; this.log = log; - this.newRWI = newRWI; this.indexArrayFileName = dumpname; this.payloadrow = payloadrow; this.bufferStructureBasis = new kelondroRow( @@ -103,7 +101,7 @@ public final class indexRAMRI implements indexRI { return entries.updated(); } - private void dump(int waitingSeconds) throws IOException { + private void dump() throws IOException { log.logConfig("creating dump for index cache '" + indexArrayFileName + "', " + cache.size() + " words (and much more urls)"); File indexDumpFile = new File(databaseRoot, indexArrayFileName); if (indexDumpFile.exists()) indexDumpFile.delete(); @@ -180,10 +178,7 @@ public final class indexRAMRI implements indexRI { if ((row == null) || (row.empty(0)) || (row.empty(3))) continue; wordHash = row.getColString(0, "UTF-8"); //creationTime = kelondroRecords.bytes2long(row[2]); - if (newRWI) - wordEntry = new indexRWIEntryNew(row.getColBytes(3)); - else - wordEntry = new indexRWIEntryOld(row.getColBytes(3)); + wordEntry = new indexRWIEntryNew(row.getColBytes(3)); // store to cache addEntry(wordHash, wordEntry, startTime, false); urlCount++; @@ -423,10 +418,10 @@ public final class indexRAMRI implements indexRI { return delCount; } - public synchronized indexContainer addEntries(indexContainer container, long updateTime, boolean dhtCase) { + public synchronized void addEntries(indexContainer container, long updateTime, boolean dhtCase) { // this puts the entries into the cache, not into the assortment directly int added = 0; - if ((container == null) || (container.size() == 0)) return null; + if ((container == null) || (container.size() == 0)) return; // put new words into cache String wordHash = container.getWordHash(); @@ -443,28 +438,26 @@ public final class indexRAMRI implements indexRI { hashDate.setScore(wordHash, intTime(updateTime)); } entries = null; - return null; } - public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { - indexContainer container = (indexContainer) cache.get(wordHash); - if (container == null) container = new indexContainer(wordHash, this.payloadrow, true); - indexRWIEntry[] entries = new indexRWIEntry[] { newEntry }; - if (container.add(entries, updateTime) > 0) { - cache.put(wordHash, container); - hashScore.incScore(wordHash); - hashDate.setScore(wordHash, intTime(updateTime)); - return null; - } - container = null; - entries = null; - return null; + public synchronized void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { + indexContainer container = (indexContainer) cache.get(wordHash); + if (container == null) container = new indexContainer(wordHash, this.payloadrow); + indexRWIEntry[] entries = new indexRWIEntry[] { newEntry }; + if (container.add(entries, updateTime) > 0) { + cache.put(wordHash, container); + hashScore.incScore(wordHash); + hashDate.setScore(wordHash, intTime(updateTime)); + return; + } + container = null; + entries = null; } - public synchronized void close(int waitingSeconds) { + public synchronized void close() { // dump cache try { - dump(waitingSeconds); + dump(); } catch (IOException e){ log.logSevere("unable to dump cache: " + e.getMessage(), e); } diff --git a/source/de/anomic/index/indexRI.java b/source/de/anomic/index/indexRI.java index 9618e0303..4313dbe9f 100644 --- a/source/de/anomic/index/indexRI.java +++ b/source/de/anomic/index/indexRI.java @@ -44,9 +44,9 @@ public interface indexRI { public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete); public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete); - public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase); - public indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase); + public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase); + public void addEntries(indexContainer newEntries, long creationTime, boolean dhtCase); - public void close(int waitingSeconds); + public void close(); } diff --git a/source/de/anomic/index/indexRWIEntryNew.java b/source/de/anomic/index/indexRWIEntryNew.java index dc2efb9b0..f064b1844 100644 --- a/source/de/anomic/index/indexRWIEntryNew.java +++ b/source/de/anomic/index/indexRWIEntryNew.java @@ -152,10 +152,9 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry { assert oldEntry.urlHash() != null; this.entry = urlEntryRow.newEntry(); int mddlm = plasmaWordIndex.microDateDays(oldEntry.lastModified()); - int mddct = plasmaWordIndex.microDateDays(System.currentTimeMillis()); this.entry.setCol(col_urlhash, oldEntry.urlHash(), null); this.entry.setCol(col_lastModified, mddlm); - this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation + this.entry.setCol(col_freshUntil, 0); this.entry.setCol(col_wordsInTitle, 20); // guessed this.entry.setCol(col_wordsInText, oldEntry.wordcount()); this.entry.setCol(col_phrasesInText, oldEntry.phrasecount()); diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 00c0fa711..4e302ef94 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -290,7 +290,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW String referrerHash = (this.refererURLString==null)?null:plasmaURL.urlHash(this.refererURLString); // create a new errorURL DB entry - plasmaCrawlEURL.Entry ee = this.sb.urlPool.errorURL.newEntry( + plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry( this.url, referrerHash, this.initiator, @@ -304,7 +304,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW ee.store(); // push it onto the stack - this.sb.urlPool.errorURL.stackPushEntry(ee); + this.sb.errorURL.stackPushEntry(ee); // delete the cache file File cacheFile = this.cacheManager.getCachePath(this.url); diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index 1429f8e44..cfe4e43ae 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -342,7 +342,7 @@ public final class CrawlWorker extends AbstractCrawlWorker { String urlhash = plasmaURL.urlHash(redirectionUrl); // removing url from loader queue - plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash); + plasmaCrawlLoader.switchboard.noticeURL.remove(urlhash); // retry crawling with new url this.url = redirectionUrl; diff --git a/source/de/anomic/plasma/dbImport/AbstractImporter.java b/source/de/anomic/plasma/dbImport/AbstractImporter.java index 823f6642a..4dcdd8798 100644 --- a/source/de/anomic/plasma/dbImport/AbstractImporter.java +++ b/source/de/anomic/plasma/dbImport/AbstractImporter.java @@ -2,7 +2,7 @@ package de.anomic.plasma.dbImport; import java.io.File; -import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.logging.serverLog; public abstract class AbstractImporter extends Thread implements dbImporter{ @@ -13,8 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ protected boolean stopped = false; protected boolean paused = false; - protected plasmaSwitchboard sb; - protected File importPath, indexPath; + protected File importPath; protected int cacheSize; protected long preloadTime; @@ -23,27 +22,27 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ protected long globalPauseLast; protected long globalPauseDuration; protected String error; + protected plasmaWordIndex wi; - public AbstractImporter(plasmaSwitchboard theSb) { - super(theSb.dbImportManager.runningJobs,""); - this.sb = theSb; + public AbstractImporter(plasmaWordIndex wi) { + //super(theSb.dbImportManager.runningJobs,""); + this.wi = wi; } public String getError() { return this.error; } - public void init(File theImportPath, File theIndexPath) { + public void init(File theImportPath) { if (theImportPath == null) throw new NullPointerException("The Import path must not be null."); this.importPath = theImportPath; - this.indexPath = theIndexPath; // getting a job id from the import manager - this.jobID = this.sb.dbImportManager.getJobID(); + //this.jobID = this.sb.dbImportManager.getJobID(); // initializing the logger and setting a more verbose thread name this.log = new serverLog("IMPORT_" + this.jobType + "_" + this.jobID); - this.setName("IMPORT_" + this.jobType + "_" + this.sb.dbImportManager.getJobID()); + this.setName("IMPORT_" + this.jobType /*+ "_" + this.sb.dbImportManager.getJobID()*/); } public void startIt() { diff --git a/source/de/anomic/plasma/dbImport/AssortmentImporter.java b/source/de/anomic/plasma/dbImport/AssortmentImporter.java index 20a5640eb..2e70f25ff 100644 --- a/source/de/anomic/plasma/dbImport/AssortmentImporter.java +++ b/source/de/anomic/plasma/dbImport/AssortmentImporter.java @@ -5,8 +5,7 @@ import java.io.IOException; import java.util.Iterator; import de.anomic.index.indexContainer; -import de.anomic.index.indexRWIEntryOld; -import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndexAssortment; public class AssortmentImporter extends AbstractImporter implements dbImporter{ @@ -15,31 +14,29 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{ private int wordEntityCount = 0; private int wordEntryCount = 0; - private File importAssortmentFile; private plasmaWordIndexAssortment assortmentFile; - public AssortmentImporter(plasmaSwitchboard sb) { - super(sb); + public AssortmentImporter(plasmaWordIndex wi) { + super(wi); this.jobType = "ASSORTMENT"; } - public void init(File theImportAssortmentFile, File theIndexFile, int theCacheSize, long preloadTime) { - super.init(theImportAssortmentFile, theIndexFile); - this.importAssortmentFile = theImportAssortmentFile; + public void init(File theImportAssortmentFile, int theCacheSize, long preloadTime) { + super.init(theImportAssortmentFile); this.cacheSize = theCacheSize; if (this.cacheSize < 2*1024*1024) this.cacheSize = 2*1024*1024; String errorMsg = null; - if (!this.importAssortmentFile.getName().matches("indexAssortment0[0-6][0-9]\\.db")) - errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' has an invalid name."; - if (!this.importAssortmentFile.exists()) - errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' does not exist."; - else if (this.importAssortmentFile.isDirectory()) - errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is a directory."; - else if (!this.importAssortmentFile.canRead()) - errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is not readable."; - else if (!this.importAssortmentFile.canWrite()) - errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is not writeable."; + if (!this.importPath.getName().matches("indexAssortment0[0-6][0-9]\\.db")) + errorMsg = "AssortmentFile '" + this.importPath + "' has an invalid name."; + if (!this.importPath.exists()) + errorMsg = "AssortmentFile '" + this.importPath + "' does not exist."; + else if (this.importPath.isDirectory()) + errorMsg = "AssortmentFile '" + this.importPath + "' is a directory."; + else if (!this.importPath.canRead()) + errorMsg = "AssortmentFile '" + this.importPath + "' is not readable."; + else if (!this.importPath.canWrite()) + errorMsg = "AssortmentFile '" + this.importPath + "' is not writeable."; if (errorMsg != null) { this.log.logSevere(errorMsg); throw new IllegalStateException(errorMsg); @@ -49,10 +46,10 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{ File importAssortmentPath = null; int assortmentNr = -1; try { - importAssortmentPath = new File(this.importAssortmentFile.getParent()); - assortmentNr = Integer.valueOf(this.importAssortmentFile.getName().substring("indexAssortment".length(),"indexAssortment".length()+3)).intValue(); + importAssortmentPath = new File(this.importPath.getParent()); + assortmentNr = Integer.valueOf(this.importPath.getName().substring("indexAssortment".length(),"indexAssortment".length()+3)).intValue(); if (assortmentNr <1 || assortmentNr > 64) { - errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' has an invalid name."; + errorMsg = "AssortmentFile '" + this.importPath + "' has an invalid name."; } } catch (NumberFormatException e) { errorMsg = "Unable to parse the assortment file number."; @@ -61,9 +58,9 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{ } // initializing the import assortment db - this.log.logInfo("Initializing source assortment file"); + this.log.logInfo("Initializing source assortment file " + theImportAssortmentFile); try { - this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexRWIEntryOld.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log); + this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, assortmentNr, this.cacheSize/1024, preloadTime, this.log); } catch (IOException e) { e.printStackTrace(); System.exit(-1); @@ -95,7 +92,7 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{ public void run() { try { // getting a content interator - Iterator contentIterator = this.assortmentFile.containers(null, true, false); + Iterator contentIterator = this.assortmentFile.wordContainers(null, true, false); while (contentIterator.hasNext()) { this.wordEntityCount++; @@ -105,14 +102,11 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{ this.wordEntryCount += container.size(); // importing entity container to home db - this.sb.wordIndex.addEntries(container, System.currentTimeMillis(), false); + wi.addEntries(container, System.currentTimeMillis(), false); - if (this.wordEntityCount % 500 == 0) { + if (this.wordEntityCount % 1000 == 0) { this.log.logFine(this.wordEntityCount + " word entities processed so far."); } - if (this.wordEntryCount % 2000 == 0) { - this.log.logFine(this.wordEntryCount + " word entries processed so far."); - } if (isAborted()) break; } } catch (Exception e) { @@ -121,8 +115,12 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{ } finally { this.log.logInfo("Import process finished."); this.globalEnd = System.currentTimeMillis(); - this.sb.dbImportManager.finishedJobs.add(this); + //this.sb.dbImportManager.finishedJobs.add(this); this.assortmentFile.close(); + File bkpPath = new File(importPath.getParentFile(), "imported"); + bkpPath.mkdirs(); + File bkpFile = new File(bkpPath, importPath.getName()); + importPath.renameTo(bkpFile); } } diff --git a/source/de/anomic/plasma/dbImport/dbImportManager.java b/source/de/anomic/plasma/dbImport/dbImportManager.java index fbca00a2a..0c8c253e2 100644 --- a/source/de/anomic/plasma/dbImport/dbImportManager.java +++ b/source/de/anomic/plasma/dbImport/dbImportManager.java @@ -58,10 +58,8 @@ public class dbImportManager { if (type.length() == 0) return null; dbImporter newImporter = null; - if (type.equals("plasmaDB")) { - newImporter = new plasmaDbImporter(this.sb); - } else if (type.equalsIgnoreCase("ASSORTMENT")) { - newImporter = new AssortmentImporter(this.sb); + if (type.equalsIgnoreCase("ASSORTMENT")) { + newImporter = new AssortmentImporter(this.sb.wordIndex); } else if (type.equalsIgnoreCase("NURL")) { newImporter = new plasmaCrawlNURLImporter(this.sb); } diff --git a/source/de/anomic/plasma/dbImport/dbImporter.java b/source/de/anomic/plasma/dbImport/dbImporter.java index 81fe9de94..c141f68fc 100644 --- a/source/de/anomic/plasma/dbImport/dbImporter.java +++ b/source/de/anomic/plasma/dbImport/dbImporter.java @@ -24,6 +24,6 @@ public interface dbImporter { public String getError(); public String getStatus(); - public void init(File importPath, File indexPath, int cacheSize, long preloadTime); + public void init(File indexPath, int cacheSize, long preloadTime); public void startIt(); } diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index 258956b47..2385b5345 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -18,9 +18,10 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor private int importStartSize; private int urlCount = 0; private int profileCount = 0; + private plasmaSwitchboard sb; public plasmaCrawlNURLImporter(plasmaSwitchboard theSb) { - super(theSb); + super(theSb.wordIndex); this.jobType="NURL"; } @@ -45,8 +46,8 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor return theStatus.toString(); } - public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) { - super.init(theImportPath, theIndexPath); + public void init(File theImportPath, int theCacheSize, long preloadTime) { + super.init(theImportPath); this.cacheSize = theCacheSize; this.preloadTime = preloadTime; @@ -174,10 +175,10 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor } // if the url does not alredy exists in the destination stack we insert it now - if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) { - plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(nextEntry); + if (!this.sb.noticeURL.existsInStack(nextHash)) { + plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(nextEntry); ne.store(); - this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash()); + this.sb.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash()); } // removing hash from the import db diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 77ec24cfa..8a3b08d58 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -9,17 +9,11 @@ import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroNaturalOrder; -import de.anomic.plasma.plasmaCrawlLURL; -import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverDate; public class plasmaDbImporter extends AbstractImporter implements dbImporter { - private plasmaCrawlLURL homeUrlDB; - private plasmaWordIndex homeWordIndex; - - private plasmaCrawlLURL importUrlDB; private plasmaWordIndex importWordIndex; private int importStartSize; @@ -30,8 +24,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0; - public plasmaDbImporter(plasmaSwitchboard theSb) { - super(theSb); + public plasmaDbImporter(plasmaWordIndex homeWI, plasmaWordIndex importWI) { + super(homeWI); + this.importWordIndex = importWI; this.jobType = "PLASMADB"; } @@ -51,18 +46,12 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { return theStatus.toString(); } - public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) { - super.init(theImportPath, theIndexPath); + public void init(File theImportPath, int theCacheSize, long preloadTime) { + super.init(theImportPath); - this.homeWordIndex = this.sb.wordIndex; - this.homeUrlDB = this.sb.urlPool.loadedURL; this.cacheSize = theCacheSize; if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024; - if (this.homeWordIndex.getRoot().equals(this.importPath)) { - throw new IllegalArgumentException("Import and home DB directory must not be equal"); - } - // configure import DB String errorMsg = null; if (!this.importPath.exists()) errorMsg = "Import directory does not exist."; @@ -75,10 +64,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { } this.log.logFine("Initializing source word index db."); - this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, true, (this.cacheSize/2)/1024, preloadTime / 2, this.log); + this.importWordIndex = new plasmaWordIndex(this.importPath, this.cacheSize/2, this.cacheSize/2, preloadTime / 2, this.log); - this.log.logFine("Initializing import URL db."); - this.importUrlDB = new plasmaCrawlLURL(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2); this.importStartSize = this.importWordIndex.size(); } @@ -87,7 +74,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { importWordsDB(); } finally { this.globalEnd = System.currentTimeMillis(); - this.sb.dbImportManager.finishedJobs.add(this); + //this.sb.dbImportManager.finishedJobs.add(this); } } @@ -107,16 +94,16 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { this.log.logInfo("STARTING DB-IMPORT"); try { - this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "' to '" + this.homeWordIndex.getRoot().getAbsolutePath() + "'."); - this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs."); - this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs."); + this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'"); + this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs."); + this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs."); HashSet unknownUrlBuffer = new HashSet(); HashSet importedUrlBuffer = new HashSet(); // iterate over all words from import db //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false); - Iterator indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator(); + Iterator indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator(); while (!isAborted() && indexContainerIterator.hasNext()) { TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true)); @@ -157,11 +144,11 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { // we need to import the url // getting the url entry - indexURLEntry urlEntry = this.importUrlDB.load(urlHash, null); + indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null); if (urlEntry != null) { /* write it into the home url db */ - this.homeUrlDB.store(urlEntry); + wi.loadedURL.store(urlEntry); importedUrlBuffer.add(urlHash); this.urlCounter++; @@ -183,7 +170,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { if (isAborted()) break; // importing entity container to home db - if (newContainer.size() > 0) { this.homeWordIndex.addEntries(newContainer, System.currentTimeMillis(), false); } + if (newContainer.size() > 0) { wi.addEntries(newContainer, System.currentTimeMillis(), false); } // delete complete index entity file this.importWordIndex.deleteContainer(this.wordHash); @@ -203,7 +190,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { "Speed: "+ 500*1000/duration + " word entities/s" + " | Elapsed time: " + serverDate.intervalToString(getElapsedTime()) + " | Estimated time: " + serverDate.intervalToString(getEstimatedTime()) + "\n" + - "Home Words = " + this.homeWordIndex.size() + + "Home Words = " + wi.size() + " | Import Words = " + this.importWordIndex.size()); this.wordChunkStart = this.wordChunkEnd; this.wordChunkStartHash = this.wordChunkEndHash; @@ -217,7 +204,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { if (!indexContainerIterator.hasNext()) { // We may not be finished yet, try to get the next chunk of wordHashes - TreeSet containers = this.importWordIndex.indexContainerSet(this.wordHash, plasmaWordIndex.RL_WORDFILES, false, 100); + TreeSet containers = this.importWordIndex.indexContainerSet(this.wordHash, false, false, 100); indexContainerIterator = containers.iterator(); // Make sure we don't get the same wordhash twice, but don't skip a word if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) { @@ -226,16 +213,15 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { } } - this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs."); - this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs."); + this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs."); + this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs."); } catch (Exception e) { this.log.logSevere("Database import failed.",e); e.printStackTrace(); this.error = e.toString(); } finally { this.log.logInfo("Import process finished."); - if (this.importUrlDB != null) try { this.importUrlDB.close(); } catch (Exception e){} - if (this.importWordIndex != null) try { this.importWordIndex.close(5000); } catch (Exception e){} + if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (Exception e){} } } diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index e2ef4dd9b..b6c805358 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -171,9 +171,12 @@ public class plasmaCrawlEURL { } } - public void close() throws IOException { + public void close() { if (urlIndexFile != null) { - urlIndexFile.close(); + try { + urlIndexFile.close(); + } catch (IOException e) { + } urlIndexFile = null; } } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 63496276f..75368f06d 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -95,11 +95,11 @@ public final class plasmaCrawlLURL { // the class object private kelondroIndex urlIndexFile = null; - public plasmaCrawlLURL(File plasmaPath, File indexPath, int bufferkb, long preloadTime) { + public plasmaCrawlLURL(File indexPath, long buffer, long preloadTime) { super(); try { - urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder); + urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", buffer, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder); } catch (IOException e) { e.printStackTrace(); System.exit(-1); @@ -583,7 +583,7 @@ public final class plasmaCrawlLURL { } catch (MalformedURLException e) {} if (args[0].equals("-l")) try { // arg 1 is path to URLCache - final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0); + final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[2]), 1, 0); final Iterator enu = urls.entries(true, false, null); while (enu.hasNext()) { System.out.println(((indexURLEntry) enu.next()).toString()); diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 888df3367..229a7100f 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -393,9 +393,9 @@ public final class plasmaCrawlStacker { // check if the url is double registered checkInterruption(); String nexturlhash = plasmaURL.urlHash(nexturl); - String dbocc = this.sb.urlPool.exists(nexturlhash); + String dbocc = this.sb.urlExists(nexturlhash); indexURLEntry oldEntry = null; - oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null); + oldEntry = this.sb.wordIndex.loadedURL.load(nexturlhash, null); boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); if ((dbocc != null) && (!(recrawl))) { reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")"; @@ -437,7 +437,7 @@ public final class plasmaCrawlStacker { // add the url into the crawling queue checkInterruption(); - plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ + plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ nexturl, /* url clear text string */ loadDate, /* load date */ referrerHash, /* last url in crawling queue */ @@ -448,7 +448,7 @@ public final class plasmaCrawlStacker { 0 /*forkfactor, default value */ ); ne.store(); - this.sb.urlPool.noticeURL.push( + this.sb.noticeURL.push( ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT : ((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/, nexturl.getHost(), @@ -1053,7 +1053,7 @@ public final class plasmaCrawlStacker { // if the url was rejected we store it into the error URL db if (rejectReason != null) { - plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry( + plasmaCrawlEURL.Entry ee = sb.errorURL.newEntry( new URL(this.theMsg.url()), this.theMsg.referrerHash(), this.theMsg.initiatorHash(), @@ -1063,7 +1063,7 @@ public final class plasmaCrawlStacker { new kelondroBitfield() ); ee.store(); - sb.urlPool.errorURL.stackPushEntry(ee); + sb.errorURL.stackPushEntry(ee); } } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index f4d93825b..7fac09f00 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -180,12 +180,12 @@ public class plasmaDHTChunk { private void selectTransferContainers(String hash, int mincount, int maxcount, int maxtime) throws InterruptedException { try { this.selectionStartTime = System.currentTimeMillis(); - int refcountRAM = selectTransferContainersResource(hash, plasmaWordIndex.RL_RAMCACHE, maxcount, maxtime); + int refcountRAM = selectTransferContainersResource(hash, true, maxcount, maxtime); if (refcountRAM >= mincount) { log.logFine("DHT selection from RAM: " + refcountRAM + " entries"); return; } - int refcountFile = selectTransferContainersResource(hash, plasmaWordIndex.RL_WORDFILES, maxcount, maxtime); + int refcountFile = selectTransferContainersResource(hash, false, maxcount, maxtime); log.logFine("DHT selection from FILE: " + refcountFile + " entries, RAM provided only " + refcountRAM + " entries"); return; } finally { @@ -193,11 +193,11 @@ public class plasmaDHTChunk { } } - private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount, int maxtime) throws InterruptedException { + private int selectTransferContainersResource(String hash, boolean ram, int maxcount, int maxtime) throws InterruptedException { // the hash is a start hash from where the indexes are picked ArrayList tmpContainers = new ArrayList(maxcount); try { - Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, resourceLevel, true, maxcount).iterator(); + Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, ram, true, maxcount).iterator(); indexContainer container; Iterator urlIter; indexRWIEntry iEntry; diff --git a/source/de/anomic/plasma/plasmaDHTFlush.java b/source/de/anomic/plasma/plasmaDHTFlush.java index 10d95f563..f8ad29a23 100644 --- a/source/de/anomic/plasma/plasmaDHTFlush.java +++ b/source/de/anomic/plasma/plasmaDHTFlush.java @@ -169,7 +169,7 @@ public class plasmaDHTFlush extends Thread { // selecting 500 words to transfer this.status = "Running: Selecting chunk " + iteration; - newDHTChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.sb.urlPool.loadedURL, this.chunkSize/3*2, this.chunkSize, -1, this.startPointHash); + newDHTChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.sb.wordIndex.loadedURL, this.chunkSize/3*2, this.chunkSize, -1, this.startPointHash); /* If we havn't selected a word chunk this could be because of * a) no words are left in the index diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index a5d103ef8..9de1c4440 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -744,7 +744,7 @@ public final class plasmaHTCache { URL url = null; // try the urlPool try { - url = plasmaSwitchboard.getSwitchboard().urlPool.getURL(urlHash); + url = plasmaSwitchboard.getSwitchboard().getURL(urlHash); } catch (Exception e) { log.logWarning("getURL(" + urlHash + "): " /*+ e.getMessage()*/, e); url = null; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 5c8174b92..6964213fc 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -134,6 +134,7 @@ import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpc; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIEntryNew; import de.anomic.plasma.plasmaURL; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; @@ -206,7 +207,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public File rankingPath; public File workPath; public HashMap rankingPermissions; - public plasmaURLPool urlPool; + public plasmaCrawlNURL noticeURL; + public plasmaCrawlEURL errorURL; public plasmaWordIndex wordIndex; public plasmaHTCache cacheManager; public plasmaSnippetCache snippetCache; @@ -366,10 +368,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // read memory amount - int ramLURL = (int) getConfigLong("ramCacheLURL", 1024) / 1024; + int ramLURL = (int) getConfigLong("ramCacheLURL", 1024); long ramLURL_time = getConfigLong("ramCacheLURL_time", 1000); - ramLURL = Math.max((int) (serverMemory.available() / 2 / 1024), ramLURL); - setConfig("ramCacheLURL", ramLURL * 1024); + ramLURL = Math.max((int) (serverMemory.available() / 2), ramLURL); + setConfig("ramCacheLURL", ramLURL); int ramNURL = (int) getConfigLong("ramCacheNURL", 1024) / 1024; long ramNURL_time = getConfigLong("ramCacheNURL_time", 1000); ramNURL = Math.max((int) (serverMemory.available() / 10 / 1024), ramNURL); @@ -378,10 +380,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser long ramEURL_time = getConfigLong("ramCacheEURL_time", 1000); ramEURL = Math.max((int) (serverMemory.available() / 20 / 1024), ramEURL); setConfig("ramCacheEURL", ramEURL * 1024); - int ramRWI = (int) getConfigLong("ramCacheRWI", 1024) / 1024; + int ramRWI = (int) getConfigLong("ramCacheRWI", 1024); long ramRWI_time = getConfigLong("ramCacheRWI_time", 1000); - ramRWI = Math.max((int) (serverMemory.available() / 4 / 1024), ramRWI); - setConfig("ramCacheRWI", ramRWI * 1024); + ramRWI = Math.max((int) (serverMemory.available() / 4), ramRWI); + setConfig("ramCacheRWI", ramRWI); int ramHTTP = (int) getConfigLong("ramCacheHTTP", 1024) / 1024; long ramHTTP_time = getConfigLong("ramCacheHTTP_time", 1000); int ramMessage = (int) getConfigLong("ramCacheMessage", 1024) / 1024; @@ -429,12 +431,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // start indexing management log.logConfig("Starting Indexing Management"); - urlPool = new plasmaURLPool(plasmaPath, indexPath, - ramLURL, - ramNURL, - ramEURL, - ramLURL_time); - wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log); + wordIndex = new plasmaWordIndex(indexPath, ramRWI, ramLURL, ramRWI_time, log); + noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1); + errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1); // set a high maximum cache size to current size; this is adopted later automatically int wordCacheMaxCount = Math.max((int) getConfigLong("wordCacheInitCount", 30000), @@ -471,7 +470,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser * initialize switchboard queue * ====================================================================== */ // create queue - this.sbQueue = new plasmaSwitchboardQueue(this.cacheManager, this.urlPool.loadedURL, new File(this.plasmaPath, "switchboardQueue1.stack"), this.profiles); + this.sbQueue = new plasmaSwitchboardQueue(this.cacheManager, this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue1.stack"), this.profiles); // setting the indexing queue slots indexingSlots = (int) getConfigLong("indexer.slots", 100); @@ -727,6 +726,29 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public boolean isRobinsonMode() { return (yacyCore.seedDB.sizeConnected() == 0) && (yacyCore.seedDB.mySeed.isVirgin()); } + + public String urlExists(String hash) { + // tests if hash occurrs in any database + // if it exists, the name of the database is returned, + // if it not exists, null is returned + if (wordIndex.loadedURL.exists(hash)) return "loaded"; + if (noticeURL.existsInStack(hash)) return "crawler"; + if (errorURL.exists(hash)) return "errors"; + return null; + } + + public URL getURL(String urlhash) throws IOException { + if (urlhash.equals(plasmaURL.dummyHash)) return null; + try { + plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); + if (ne != null) return ne.url(); + } catch (IOException e) {} + indexURLEntry le = wordIndex.loadedURL.load(urlhash, null); + if (le != null) return le.comp().url(); + plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); + if (ee != null) return ee.url(); + return null; + } /** * This method changes the HTCache size.
@@ -796,7 +818,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public boolean cleanProfiles() throws InterruptedException { - if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return false; + if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return false; final Iterator iter = profiles.profiles(true); plasmaCrawlProfile.entry entry; boolean hasDoneSomething = false; @@ -970,9 +992,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser sbQueue.close(); flushCitationReference(crg, "crg"); log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)"); - int waitingBoundSeconds = Integer.parseInt(getConfig("maxWaitingWordFlush", "120")); - urlPool.close(); - wordIndex.close(waitingBoundSeconds); + noticeURL.close(); + errorURL.close(); + wordIndex.close(); log.logConfig("SWITCHBOARD SHUTDOWN TERMINATED"); } @@ -1017,7 +1039,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // flush some entries from the RAM cache // (new permanent cache flushing) wordIndex.flushCacheSome(sbQueue.size() != 0); - urlPool.loadedURL.flushCacheSome(); + wordIndex.loadedURL.flushCacheSome(); boolean doneSomething = false; @@ -1041,7 +1063,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ) { // generate new chunk int minChunkSize = (int) getConfigLong("indexDistribution.minChunkSize", 30); - dhtTransferChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.urlPool.loadedURL, minChunkSize, dhtTransferIndexCount, 5000); + dhtTransferChunk = new plasmaDHTChunk(this.log, wordIndex, wordIndex.loadedURL, minChunkSize, dhtTransferIndexCount, 5000); doneSomething = true; } @@ -1079,10 +1101,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // do one processing step log.logFine("DEQUEUE: sbQueueSize=" + sbQueue.size() + - ", coreStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + - ", limitStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + - ", overhangStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + - ", remoteStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE)); + ", coreStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + + ", limitStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + + ", overhangStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + + ", remoteStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE)); try { nextentry = sbQueue.pop(); if (nextentry == null) { @@ -1112,9 +1134,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public int cleanupJobSize() { int c = 0; - if ((urlPool.errorURL.stackSize() > 1000)) c++; + if ((errorURL.stackSize() > 1000)) c++; for (int i = 1; i <= 6; i++) { - if (urlPool.loadedURL.getStackSize(i) > 1000) c++; + if (wordIndex.loadedURL.getStackSize(i) > 1000) c++; } return c; } @@ -1133,17 +1155,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // clean up error stack checkInterruption(); - if ((urlPool.errorURL.stackSize() > 1000)) { - log.logFine("Cleaning Error-URLs report stack, " + urlPool.errorURL.stackSize() + " entries on stack"); - urlPool.errorURL.clearStack(); + if ((errorURL.stackSize() > 1000)) { + log.logFine("Cleaning Error-URLs report stack, " + errorURL.stackSize() + " entries on stack"); + errorURL.clearStack(); hasDoneSomething = true; } // clean up loadedURL stack for (int i = 1; i <= 6; i++) { checkInterruption(); - if (urlPool.loadedURL.getStackSize(i) > 1000) { - log.logFine("Cleaning Loaded-URLs report stack, " + urlPool.loadedURL.getStackSize(i) + " entries on stack " + i); - urlPool.loadedURL.clearStack(i); + if (wordIndex.loadedURL.getStackSize(i) > 1000) { + log.logFine("Cleaning Loaded-URLs report stack, " + wordIndex.loadedURL.getStackSize(i) + " entries on stack " + i); + wordIndex.loadedURL.clearStack(i); hasDoneSomething = true; } } @@ -1209,11 +1231,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public int coreCrawlJobSize() { - return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); + return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); } public boolean coreCrawlJob() { - if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) { + if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) { //log.logDebug("CoreCrawl: queue is empty"); return false; } @@ -1247,10 +1269,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // do a local crawl plasmaCrawlNURL.Entry urlEntry = null; - while (urlEntry == null && urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) { - String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) { + String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; try { - urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE); + urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE); String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); @@ -1276,11 +1298,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public int limitCrawlTriggerJobSize() { - return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); + return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); } public boolean limitCrawlTriggerJob() { - if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) { + if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) { //log.logDebug("LimitCrawl: queue is empty"); return false; } @@ -1292,7 +1314,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (toshift > 1000) toshift = 1000; if (toshift > limitCrawlTriggerJobSize()) toshift = limitCrawlTriggerJobSize(); for (int i = 0; i < toshift; i++) { - urlPool.noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE); + noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE); } log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl"); } @@ -1312,10 +1334,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // start a global crawl, if possible - String stats = "REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " - + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; try { - plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); + plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); @@ -1327,7 +1349,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logFine("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); - boolean tryRemote = ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) && + boolean tryRemote = ((noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) && (profile.remoteIndexing()) && (urlEntry.initiator() != null) && // (!(urlEntry.initiator().equals(indexURL.dummyHash))) && @@ -1359,7 +1381,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public int remoteTriggeredCrawlJobSize() { - return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE); + return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE); } public boolean remoteTriggeredCrawlJob() { @@ -1367,7 +1389,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // do nothing if either there are private processes to be done // or there is no global crawl on the stack - if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) { + if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) { //log.logDebug("GlobalCrawl: queue is empty"); return false; } @@ -1398,10 +1420,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view) - String stats = "REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " - + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; try { - plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); + plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + @@ -1531,7 +1553,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } catch (MalformedURLException e1) {} } log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.normalizedURLString() + - ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); + ", NEW CRAWL STACK SIZE IS " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); } stackEndTime = System.currentTimeMillis(); @@ -1568,7 +1590,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); // create a new loaded URL db entry - indexURLEntry newEntry = urlPool.loadedURL.newEntry( + indexURLEntry newEntry = wordIndex.loadedURL.newEntry( entry.url(), // URL docDescription, // document description "", // author @@ -1594,8 +1616,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser /* ======================================================================== * STORE URL TO LOADED-URL-DB * ======================================================================== */ - urlPool.loadedURL.store(newEntry); - urlPool.loadedURL.stack( + wordIndex.loadedURL.store(newEntry); + wordIndex.loadedURL.stack( newEntry, // loaded url db entry initiatorPeerHash, // initiator peer hash yacyCore.seedDB.mySeed.hash, // executor peer hash @@ -1672,7 +1694,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String word = (String) wentry.getKey(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); String wordHash = plasmaCondenser.word2hash(word); - indexRWIEntry wordIdxEntry = wordIndex.newRWIEntry( + indexRWIEntry wordIdxEntry = new indexRWIEntryNew( urlHash, urlLength, urlComps, wordStat.count, @@ -1807,7 +1829,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // removing current entry from notice URL queue - boolean removed = urlPool.noticeURL.remove(entry.urlHash()); // worked-off + boolean removed = noticeURL.remove(entry.urlHash()); // worked-off if (!removed) { log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect."); } @@ -1911,7 +1933,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser URL refererURL = null; String refererHash = urlEntry.referrerHash(); if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try { - refererURL = this.urlPool.getURL(refererHash); + refererURL = this.getURL(refererHash); } catch (IOException e) { refererURL = null; } @@ -1924,7 +1946,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // return true iff another peer has/will index(ed) the url if (urlEntry == null) { - log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); + log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); return true; // superfluous request; true correct in this context } @@ -1952,7 +1974,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // do the request try { - HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()), 6000); + HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerHash()), 6000); // check success /* @@ -1990,10 +2012,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String lurl = (String) page.get("lurl"); if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); - indexURLEntry entry = urlPool.loadedURL.newEntry(propStr); - urlPool.loadedURL.store(entry); - urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? - urlPool.noticeURL.remove(entry.hash()); + indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr); + wordIndex.loadedURL.store(entry); + wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? + noticeURL.remove(entry.hash()); log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'"); return true; } else { @@ -2051,7 +2073,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //} // create a new search event - plasmaSearchEvent theSearch = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, postsort, log, wordIndex, urlPool.loadedURL, snippetCache); + plasmaSearchEvent theSearch = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, postsort, log, wordIndex, wordIndex.loadedURL, snippetCache); plasmaSearchResult acc = theSearch.search(); // fetch snippets @@ -2094,7 +2116,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if ((seed == null) || ((address = seed.getAddress()) == null)) { // seed is not known from here removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8")); - urlPool.loadedURL.remove(urlentry.hash()); // clean up + wordIndex.loadedURL.remove(urlentry.hash()); // clean up continue; // next result } urlname = "http://share." + seed.getName() + ".yacy" + filename; @@ -2217,7 +2239,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // finally, delete the url entry // determine the url string - indexURLEntry entry = urlPool.loadedURL.load(urlhash, null); + indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null); if (entry == null) return 0; indexURLEntry.Components comp = entry.comp(); if (comp.url() == null) return 0; @@ -2245,7 +2267,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (witer != null) count = removeReferences(urlhash, witer); // finally delete the url entry itself - urlPool.loadedURL.remove(urlhash); + wordIndex.loadedURL.remove(urlhash); return count; } catch (ParserException e) { return 0; @@ -2373,15 +2395,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (getConfig("allowDistributeIndex","false").equalsIgnoreCase("false")) { return "no DHT distribution: not enabled"; } - if (urlPool.loadedURL.size() < 10) { - return "no DHT distribution: loadedURL.size() = " + urlPool.loadedURL.size(); + if (wordIndex.loadedURL.size() < 10) { + return "no DHT distribution: loadedURL.size() = " + wordIndex.loadedURL.size(); } if (wordIndex.size() < 100) { return "no DHT distribution: not enough words - wordIndex.size() = " + wordIndex.size(); } if ((getConfig("allowDistributeIndexWhileCrawling","false").equalsIgnoreCase("false")) && - ((urlPool.noticeURL.stackSize() > 0) || (sbQueue.size() > 3))) { - return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + urlPool.noticeURL.stackSize() + ", sbQueue.size() = " + sbQueue.size(); + ((noticeURL.stackSize() > 0) || (sbQueue.size() > 3))) { + return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + noticeURL.stackSize() + ", sbQueue.size() = " + sbQueue.size(); } return null; } @@ -2522,7 +2544,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser kelondroBitfield flags ) { // create a new errorURL DB entry - plasmaCrawlEURL.Entry ee = this.urlPool.errorURL.newEntry( + plasmaCrawlEURL.Entry ee = this.errorURL.newEntry( url, referrerHash, initiator, @@ -2534,7 +2556,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // store the entry ee.store(); // push it onto the stack - this.urlPool.errorURL.stackPushEntry(ee); + this.errorURL.stackPushEntry(ee); } public void checkInterruption() throws InterruptedException { diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java deleted file mode 100644 index 59fec6dee..000000000 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ /dev/null @@ -1,99 +0,0 @@ -// plasmaURLPool.java -// ----------------------- -// part of YaCy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 -// last major change: 16.06.2005 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -// this class combines all url storage methods into one. It is the host for all url storage - - -package de.anomic.plasma; - -import java.io.File; -import java.io.IOException; - -import de.anomic.plasma.plasmaURL; -import de.anomic.index.indexURLEntry; -import de.anomic.net.URL; - -public class plasmaURLPool { - - - public final plasmaCrawlLURL loadedURL; - public final plasmaCrawlNURL noticeURL; - public final plasmaCrawlEURL errorURL; - - public plasmaURLPool(File plasmaPath, File indexPath, - int ramLURL, - int ramNURL, - int ramEURL, - long preloadTime) { - loadedURL = new plasmaCrawlLURL(plasmaPath, indexPath, ramLURL, preloadTime); - noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1); - errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1); - } - - public String exists(String hash) { - // tests if hash occurrs in any database - // if it exists, the name of the database is returned, - // if it not exists, null is returned - if (loadedURL.exists(hash)) return "loaded"; - if (noticeURL.existsInStack(hash)) return "crawler"; - if (errorURL.exists(hash)) return "errors"; - return null; - } - - public URL getURL(String urlhash) throws IOException { - if (urlhash.equals(plasmaURL.dummyHash)) return null; - try { - plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); - if (ne != null) return ne.url(); - } catch (IOException e) {} - indexURLEntry le = loadedURL.load(urlhash, null); - if (le != null) return le.comp().url(); - plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); - if (ee != null) return ee.url(); - return null; - } - - public void close() { - try {loadedURL.close();} catch (IOException e) {} - noticeURL.close(); - try {errorURL.close();} catch (IOException e) {} - } -} diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index d1a78b912..08df1d71d 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -27,7 +27,6 @@ package de.anomic.plasma; import java.io.File; -import java.io.IOException; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -47,11 +46,8 @@ import de.anomic.index.indexRWIEntryNew; import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroBitfield; -import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMergeIterator; import de.anomic.kelondro.kelondroOrder; -import de.anomic.kelondro.kelondroRow; import de.anomic.net.URL; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.logging.serverLog; @@ -59,67 +55,33 @@ import de.anomic.yacy.yacyDHTAction; public final class plasmaWordIndex implements indexRI { - private static final kelondroRow payloadrowold = indexRWIEntryOld.urlEntryRow; - private static final kelondroRow payloadrownew = indexRWIEntryNew.urlEntryRow; + private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder; + private final indexRAMRI dhtOutCache, dhtInCache; + private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster + public boolean busyCacheFlush; // shows if a cache flush is currently performed + private int idleDivisor, busyDivisor; + public final plasmaCrawlLURL loadedURL; - private final File oldDatabaseRoot; - private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder; - private final indexRAMRI dhtOutCache, dhtInCache; - private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster - public boolean busyCacheFlush; // shows if a cache flush is currently performed - private int idleDivisor, busyDivisor; - - public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log) { - this.oldDatabaseRoot = oldDatabaseRoot; - File textindexcache = new File(newIndexRoot, "PUBLIC/TEXT/RICACHE"); + public plasmaWordIndex(File indexRoot, long rwibuffer, long lurlbuffer, long preloadTime, serverLog log) { + File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE"); if (!(textindexcache.exists())) textindexcache.mkdirs(); - this.dhtOutCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump1.array", log, true); - this.dhtInCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump2.array", log, true); + this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntryNew.urlEntryRow, 2040, "dump1.array", log); + this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntryNew.urlEntryRow, 2040, "dump2.array", log); // create collections storage path - File textindexcollections = new File(newIndexRoot, "PUBLIC/TEXT/RICOLLECTION"); + File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION"); if (!(textindexcollections.exists())) textindexcollections.mkdirs(); - this.collections = new indexCollectionRI(textindexcollections, "collection", bufferkb * 1024, preloadTime, payloadrownew); + this.collections = new indexCollectionRI(textindexcollections, "collection", rwibuffer, preloadTime, indexRWIEntryNew.urlEntryRow); + + // create LURL-db + loadedURL = new plasmaCrawlLURL(indexRoot, lurlbuffer, preloadTime); + // performance settings busyCacheFlush = false; this.busyDivisor = 5000; this.idleDivisor = 420; } - public kelondroRow payloadrow() { - return payloadrownew; - } - - public indexRWIEntry newRWIEntry( - String urlHash, - int urlLength, - int urlComps, - int titleLength, - int hitcount, - int wordcount, - int phrasecount, - int posintext, - int posinphrase, - int posofphrase, - int worddistance, - int sizeOfPage, - long lastmodified, - long updatetime, - int quality, - String language, - char doctype, - int outlinksSame, - int outlinksOther, - kelondroBitfield flags ) { - return new indexRWIEntryNew(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount, - posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype, - outlinksSame, outlinksOther, flags); - } - - public File getRoot() { - return oldDatabaseRoot; - } - public int maxURLinDHTOutCache() { return dhtOutCache.maxURLinCache(); } @@ -184,12 +146,12 @@ public final class plasmaWordIndex implements indexRI { } public indexContainer emptyContainer(String wordHash) { - return new indexContainer(wordHash, payloadrow(), true); + return new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow); } - public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) { + public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) { if (entry instanceof indexRWIEntryOld) { - if (entry.urlHash() == null) return null; + if (entry.urlHash() == null) return; entry = new indexRWIEntryNew((indexRWIEntryOld) entry); } @@ -203,12 +165,12 @@ public final class plasmaWordIndex implements indexRI { dhtOutCache.addEntry(wordHash, entry, updateTime, false); flushControl(); } - return null; } + /* private indexContainer convertOld2New(indexContainer entries) { // convert old entries to new entries - indexContainer newentries = new indexContainer(entries.getWordHash(), payloadrownew, true); + indexContainer newentries = new indexContainer(entries.getWordHash(), indexRWIEntryNew.urlEntryRow); Iterator i = entries.entries(); indexRWIEntryOld old; while (i.hasNext()) { @@ -219,9 +181,9 @@ public final class plasmaWordIndex implements indexRI { } return newentries; } - - public indexContainer addEntries(indexContainer entries, long updateTime, boolean dhtInCase) { - if (entries.row().objectsize() == payloadrowold.objectsize()) entries = convertOld2New(entries); + */ + public void addEntries(indexContainer entries, long updateTime, boolean dhtInCase) { + assert (entries.row().objectsize() == indexRWIEntryNew.urlEntryRow.objectsize()); // set dhtInCase depending on wordHash if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(entries.getWordHash()))) dhtInCase = true; @@ -233,7 +195,6 @@ public final class plasmaWordIndex implements indexRI { dhtOutCache.addEntries(entries, updateTime, false); flushControl(); } - return null; } public void flushCacheSome(boolean busy) { @@ -263,12 +224,7 @@ public final class plasmaWordIndex implements indexRI { // flush the wordHash indexContainer c = ram.deleteContainer(wordHash); - if (c != null) { - indexContainer feedback = collections.addEntries(c, c.updated(), false); - if (feedback != null) { - throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString()); - } - } + if (c != null) collections.addEntries(c, c.updated(), false); // pause to next loop to give other processes a chance to use IO //try {this.wait(8);} catch (InterruptedException e) {} @@ -330,7 +286,7 @@ public final class plasmaWordIndex implements indexRI { wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); wordHash = plasmaCondenser.word2hash(word); - ientry = newRWIEntry(urlHash, + ientry = new indexRWIEntryNew(urlHash, urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(), wprop.count, condenser.RESULT_SIMI_WORDS, @@ -415,16 +371,16 @@ public final class plasmaWordIndex implements indexRI { return size; } - public void close(int waitingBoundSeconds) { + public void close() { synchronized (this) { - dhtInCache.close(waitingBoundSeconds); - dhtOutCache.close(waitingBoundSeconds); - collections.close(-1); + dhtInCache.close(); + dhtOutCache.close(); + collections.close(); } } public indexContainer deleteContainer(String wordHash) { - indexContainer c = new indexContainer(wordHash, payloadrow(), true); + indexContainer c = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow); c.add(dhtInCache.deleteContainer(wordHash), -1); c.add(dhtOutCache.deleteContainer(wordHash), -1); c.add(collections.deleteContainer(wordHash), -1); @@ -456,9 +412,7 @@ public final class plasmaWordIndex implements indexRI { } public static final int RL_RAMCACHE = 0; - public static final int RL_COLLECTIONS = 1; // the new index structure - public static final int RL_ASSORTMENTS = 2; // (to be) outdated structure - public static final int RL_WORDFILES = 3; // (to be) outdated structure + public static final int RL_COLLECTIONS = 1; public int tryRemoveURLs(String urlHash) { // this tries to delete an index from the cache that has this @@ -468,14 +422,14 @@ public final class plasmaWordIndex implements indexRI { return dhtInCache.tryRemoveURLs(urlHash) | dhtOutCache.tryRemoveURLs(urlHash); } - public TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) { + public TreeSet indexContainerSet(String startHash, boolean ram, boolean rot, int count) { // creates a set of indexContainers // this does not use the dhtInCache kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone()); containerOrder.rotate(startHash.getBytes()); TreeSet containers = new TreeSet(containerOrder); - Iterator i = wordContainers(startHash, resourceLevel, rot); - if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) count = Math.min(dhtOutCache.size(), count); + Iterator i = wordContainers(startHash, ram, rot); + if (ram) count = Math.min(dhtOutCache.size(), count); indexContainer container; while ((count > 0) && (i.hasNext())) { container = (indexContainer) i.next(); @@ -486,38 +440,35 @@ public final class plasmaWordIndex implements indexRI { } return containers; } + - public Iterator wordContainers(String startHash, boolean rot) { - // returns an iteration of indexContainers - return wordContainers(startHash, RL_WORDFILES, rot); - } - - public Iterator wordContainers(String startHash, int resourceLevel, boolean rot) { - if (rot) return new rotatingContainerIterator(startHash, resourceLevel); - else return wordContainers(startHash, resourceLevel); + public Iterator wordContainers(String startHash, boolean ram, boolean rot) { + if (rot) return new rotatingContainerIterator(startHash, ram); + else return wordContainers(startHash, ram); } - private Iterator wordContainers(String startWordHash, int resourceLevel) { + public Iterator wordContainers(String startWordHash, boolean ram) { kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone()); containerOrder.rotate(startWordHash.getBytes()); - if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) { + if (ram) { return dhtOutCache.wordContainers(startWordHash, false); - } - return new kelondroMergeIterator( + } else { + return new kelondroMergeIterator( dhtOutCache.wordContainers(startWordHash, false), collections.wordContainers(startWordHash, false), containerOrder, indexContainer.containerMergeMethod, true); + } } public class rotatingContainerIterator implements Iterator { Iterator i; - int resourceLevel; + boolean ram; - public rotatingContainerIterator(String startWordHash, int resourceLevel) { - this.resourceLevel = resourceLevel; - i = wordContainers(startWordHash, resourceLevel); + public rotatingContainerIterator(String startWordHash, boolean ram) { + this.ram = ram; + i = wordContainers(startWordHash, ram); } public void finalize() { @@ -527,7 +478,7 @@ public final class plasmaWordIndex implements indexRI { public boolean hasNext() { if (i.hasNext()) return true; else { - i = wordContainers("------------", resourceLevel); + i = wordContainers("------------", ram); return i.hasNext(); } } @@ -541,44 +492,6 @@ public final class plasmaWordIndex implements indexRI { } } // class rotatingContainerIterator - public Object migrateWords2index(String wordhash) throws IOException { - // returns the number of entries that had been added to the assortments - // can be negative if some assortments have been moved to the backend - File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash); - if (!(db.exists())) return "not available"; - plasmaWordIndexFile entity = null; - try { - entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true); - int size = entity.size(); - indexContainer container = new indexContainer(wordhash, payloadrow(), true); - - try { - Iterator entries = entity.elements(true); - indexRWIEntry entry; - while (entries.hasNext()) { - entry = (indexRWIEntry) entries.next(); - // System.out.println("ENTRY = " + entry.getUrlHash()); - container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis()); - } - // we have read all elements, now delete the entity - entity.deleteComplete(); - entity.close(); - entity = null; - - indexContainer feedback = collections.addEntries(container, container.updated(), false); - if (feedback != null) return feedback; - return new Integer(size); - } catch (kelondroException e) { - // database corrupted, we simply give up the database and delete it - try { entity.close(); } catch (Exception ee) { } - entity = null; - try { db.delete(); } catch (Exception ee) { } - return "database corrupted; deleted"; - } - } finally { - if (entity != null) try {entity.close();}catch(Exception e){} - } - } // The Cleaner class was provided as "UrldbCleaner" by Hydrox // see http://www.yacy-forum.de/viewtopic.php?p=18093#18093 @@ -609,7 +522,7 @@ public final class plasmaWordIndex implements indexRI { indexRWIEntry entry = null; URL url = null; HashSet urlHashs = new HashSet(); - Iterator indexContainerIterator = indexContainerSet(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator(); + Iterator indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator(); while (indexContainerIterator.hasNext() && run) { waiter(); container = (indexContainer) indexContainerIterator.next(); @@ -639,7 +552,7 @@ public final class plasmaWordIndex implements indexRI { } if (!containerIterator.hasNext()) { // We may not be finished yet, try to get the next chunk of wordHashes - TreeSet containers = indexContainerSet(container.getWordHash(), plasmaWordIndex.RL_WORDFILES, false, 100); + TreeSet containers = indexContainerSet(container.getWordHash(), false, false, 100); indexContainerIterator = containers.iterator(); // Make sure we don't get the same wordhash twice, but don't skip a word if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) { @@ -693,13 +606,14 @@ public final class plasmaWordIndex implements indexRI { public static void main(String[] args) { // System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y")); // System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis())))); - File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB"); + /* File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX"); - plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, true, 555, 1000, new serverLog("TESTAPP")); + plasmaWordIndex index = new plasmaWordIndex(indexdb, true, 555, 1000, new serverLog("TESTAPP")); Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true); while (containerIter.hasNext()) { System.out.println("File: " + (indexContainer) containerIter.next()); } + */ } } diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 5542e4366..7a04961a6 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -58,6 +58,7 @@ import java.util.Iterator; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIEntryNew; import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroColumn; @@ -74,12 +75,9 @@ public final class plasmaWordIndexAssortment { // class variables private File assortmentFile; - private int assortmentLength; private serverLog log; private kelondroCache assortments; private long bufferSize; - private long preloadTime; - private kelondroRow payloadrow; private static String intx(int x) { String s = Integer.toString(x); @@ -92,23 +90,20 @@ public final class plasmaWordIndexAssortment { structure[0] = new kelondroColumn("byte[] wordhash-" + yacySeedDB.commonHashLength); structure[1] = new kelondroColumn("Cardinal occ-4 {b256}"); structure[2] = new kelondroColumn("Cardinal time-8 {b256}"); - kelondroColumn p = new kelondroColumn("byte[] urlprops-" + payloadrow.objectsize()); + kelondroColumn p = new kelondroColumn("byte[] urlprops-" + indexRWIEntryOld.urlEntryRow.objectsize()); for (int i = 0; i < assortmentCapacity; i++) structure[3 + i] = p; return new kelondroRow(structure); } private int assortmentCapacity(int rowsize) { - return (rowsize - yacySeedDB.commonHashLength - 12) / payloadrow.objectsize(); + return (rowsize - yacySeedDB.commonHashLength - 12) / indexRWIEntryOld.urlEntryRow.objectsize(); } - public plasmaWordIndexAssortment(File storagePath, kelondroRow payloadrow, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException { + public plasmaWordIndexAssortment(File storagePath, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException { if (!(storagePath.exists())) storagePath.mkdirs(); - this.payloadrow = payloadrow; this.assortmentFile = new File(storagePath, assortmentFileName + intx(assortmentLength) + ".db"); - this.assortmentLength = assortmentLength; //this.bufferStructureLength = 3 + 2 * assortmentLength; this.bufferSize = bufferkb * 1024; - this.preloadTime = preloadTime; this.log = log; // open assortment tree file long start = System.currentTimeMillis(); @@ -122,138 +117,26 @@ public final class plasmaWordIndexAssortment { assortments.cacheNodeStatus()[1] + " preloaded"); } - - public void store(indexContainer newContainer) throws IOException { - // stores a word index to assortment database - // this throws an exception if the word hash already existed - //log.logDebug("storeAssortment: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime); - if (newContainer.size() != assortmentLength) throw new RuntimeException("plasmaWordIndexAssortment.store: wrong container size"); - kelondroRow.Entry row = assortments.row().newEntry(); - row.setCol(0, newContainer.getWordHash().getBytes()); - row.setCol(1, 1); - row.setCol(2, newContainer.updated()); - Iterator entries = newContainer.entries(); - indexRWIEntry entry; - for (int i = 0; i < assortmentLength; i++) { - entry = (indexRWIEntry) entries.next(); - row.setCol(3 + i, entry.toKelondroEntry().bytes()); - } - kelondroRow.Entry oldrow = null; - try { - oldrow = assortments.put(row); - } catch (IOException e) { - e.printStackTrace(); - log.logSevere("storeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); - } catch (IndexOutOfBoundsException e) { - e.printStackTrace(); - log.logSevere("storeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); - } catch (kelondroException e) { - e.printStackTrace(); - log.logSevere("storeAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); - } - if (oldrow != null) throw new RuntimeException("Store to assortment ambiguous"); - } - - public indexContainer remove(String wordHash) { - // deletes a word index from assortment database - // and returns the content record - kelondroRow.Entry row = null; - try { - row = assortments.remove(wordHash.getBytes()); - } catch (IOException e) { - log.logSevere("removeAssortment/IO-error: " + e.getMessage() - + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); - return null; - } catch (kelondroException e) { - log.logSevere("removeAssortment/kelondro-error: " + e.getMessage() - + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); - return null; - } - return row2container(row); - } - - public boolean contains(String wordHash) { - // gets a word index from assortment database - // and returns the content record - kelondroRow.Entry row = null; - try { - row = assortments.get(wordHash.getBytes()); - return (row != null); - } catch (IOException e) { - return false; - } catch (kelondroException e) { - log.logSevere("removeAssortment/kelondro-error: " + e.getMessage() - + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); - return false; - } - } - - public indexContainer get(String wordHash) { - // gets a word index from assortment database - // and returns the content record - kelondroRow.Entry row = null; - try { - row = assortments.get(wordHash.getBytes()); - } catch (IOException e) { - log.logSevere("removeAssortment/IO-error: " + e.getMessage() - + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); - return null; - } catch (kelondroException e) { - log.logSevere("removeAssortment/kelondro-error: " + e.getMessage() - + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); - return null; - } - return row2container(row); - } public final indexContainer row2container(kelondroRow.Entry row) { if (row == null) return null; String wordHash = row.getColString(0, null); final long updateTime = row.getColLong(2); - indexContainer container = new indexContainer(wordHash, payloadrow, false); + indexContainer container = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow); int al = assortmentCapacity(row.objectsize()); for (int i = 0; i < al; i++) { - container.add(new indexRWIEntry[] { new indexRWIEntryOld(row.getColBytes(3 + i)) }, updateTime); + // fill AND convert old entries to new entries + container.add(new indexRWIEntry[] { new indexRWIEntryNew(new indexRWIEntryOld(row.getColBytes(3 + i))) }, updateTime); } return container; } - private void resetDatabase() { - // deletes the assortment database and creates a new one - if (assortments != null) try { - assortments.close(); - } catch (IOException e) {} - - try { - // make a back-up - File backupPath = new File(assortmentFile.getParentFile(), "ABKP"); - if (!(backupPath.exists())) backupPath.mkdirs(); - File backupFile = new File(backupPath, assortmentFile.getName() + System.currentTimeMillis()); - assortmentFile.renameTo(backupFile); - log.logInfo("a back-up of the deleted assortment file is in " + backupFile.toString()); - if (assortmentFile.exists()) assortmentFile.delete(); - assortments = new kelondroCache(kelondroTree.open(assortmentFile, bufferSize / 2, preloadTime, bufferStructure(assortmentLength)), bufferSize / 2, true, false); - } catch (Exception e) { - // if this fails, delete the file - if (!(assortmentFile.delete())) throw new RuntimeException("cannot delete assortment database"); - } - } - - public Iterator containers(String startWordHash, boolean up, boolean rot) throws IOException { + public Iterator wordContainers(String startWordHash, boolean up, boolean rot) throws IOException { // returns an iteration of indexContainer elements try { return new containerIterator(startWordHash, up, rot); } catch (kelondroException e) { log.logSevere("iterateAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e); - resetDatabase(); return null; } } @@ -288,22 +171,6 @@ public final class plasmaWordIndexAssortment { return 0; } } - - public int cacheNodeChunkSize() { - return assortments.cacheNodeChunkSize(); - } - - public int cacheObjectChunkSize() { - return assortments.cacheObjectChunkSize(); - } - - public int[] cacheNodeStatus() { - return assortments.cacheNodeStatus(); - } - - public long[] cacheObjectStatus() { - return assortments.cacheObjectStatus(); - } public void close() { try { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java deleted file mode 100644 index 377cc8f09..000000000 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ /dev/null @@ -1,408 +0,0 @@ -// plasmaWordIndexAssortmentCluster.java -// ------------------------------------- -// part of YACY -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 -// last major change: 20.5.2005 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -/* - An assortment-cluster is a set of assortments. - Each one carries a different number of URL's - */ - -package de.anomic.plasma; - -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; - -import de.anomic.index.indexContainer; -import de.anomic.index.indexContainerOrder; -import de.anomic.index.indexRWIEntry; -import de.anomic.index.indexRI; -import de.anomic.kelondro.kelondroCache; -import de.anomic.kelondro.kelondroMergeIterator; -import de.anomic.kelondro.kelondroNaturalOrder; -import de.anomic.kelondro.kelondroRecords; -import de.anomic.kelondro.kelondroRow; -import de.anomic.server.logging.serverLog; - -public final class plasmaWordIndexAssortmentCluster implements indexRI { - - // class variables - private int clusterCount; // number of cluster files - public int clusterCapacity; // number of all url referrences that can be stored to a single word in the cluster - - //private serverLog log; - private plasmaWordIndexAssortment[] assortments; - private long completeBufferKB; - private kelondroRow payloadrow; - - public plasmaWordIndexAssortmentCluster(File assortmentsPath, int clusterCount, kelondroRow payloadrow, int bufferkb, long preloadTime, serverLog log) throws IOException { - // set class variables - if (!(assortmentsPath.exists())) assortmentsPath.mkdirs(); - this.payloadrow = payloadrow; - this.clusterCount = clusterCount; - this.clusterCapacity = clusterCount * (clusterCount + 1) / 2; - this.completeBufferKB = bufferkb; - // this.log = log; - this.assortments = new plasmaWordIndexAssortment[clusterCount]; - - // open cluster and close it directly again to detect the element sizes - int[] sizes = new int[clusterCount]; - int sumSizes = 1; - plasmaWordIndexAssortment testAssortment; - for (int i = 0; i < clusterCount; i++) { - testAssortment = new plasmaWordIndexAssortment(assortmentsPath, payloadrow, i + 1, 0, 0, null); - sizes[i] = testAssortment.size() + clusterCount - i; - sumSizes += sizes[i]; - testAssortment.close(); - testAssortment = null; - } - - // initialize cluster using the cluster elements size for optimal buffer - // size - long nextTime; - long startTime; - long sS = (long) sumSizes; - for (int i = 0; i < clusterCount; i++) { - nextTime = Math.max(0, preloadTime * ((long) sizes[i]) / sS); - startTime = System.currentTimeMillis(); - assortments[i] = new plasmaWordIndexAssortment( - assortmentsPath, - payloadrow, - i + 1, - (int) (completeBufferKB * (long) sizes[i] / (long) sumSizes), - nextTime, - log); - preloadTime -= System.currentTimeMillis() - startTime; - sS -= sizes[i]; - } - } - - private indexContainer storeSingular(indexContainer newContainer) throws IOException { - // this tries to store the record. If the record does not fit, or a same hash already - // exists and would not fit together with the new record, then the record is deleted from - // the assortmen(s) and returned together with the newRecord. - // if storage was successful, NULL is returned. - if (newContainer.size() > clusterCount) return newContainer; // it will not fit - indexContainer buffer; - while ((buffer = assortments[newContainer.size() - 1].remove(newContainer.getWordHash())) != null) { - if (newContainer.add(buffer, -1) == 0) return newContainer; // security check; othervise this loop does not terminate - if (newContainer.size() > clusterCount) return newContainer; // it will not fit - } - // the assortment (newContainer.size() - 1) should now be empty. put it in there - assortments[newContainer.size() - 1].store(newContainer); - // return null to show that we have stored the new Record successfully - return null; - } - - private void storeForced(indexContainer newContainer) throws IOException { - // this stores the record and overwrites an existing record. - // this is safe if we can be shure that the record does not exist before. - if ((newContainer == null) || (newContainer.size() == 0) || (newContainer.size() > clusterCount)) return; // it will not fit - assortments[newContainer.size() - 1].store(newContainer); - } - - private void storeStretched(indexContainer newContainer) throws IOException { - // this stores the record and stretches the storage over - // all the assortments that are necessary to fit in the record - // IMPORTANT: it must be ensured that the wordHash does not exist in the cluster before - // i.e. by calling removeFromAll - if (newContainer.size() <= clusterCount) { - storeForced(newContainer); - return; - } - - // calculate minimum cluster insert point - int clusterMinStart = clusterCount; - int cap = clusterCapacity - newContainer.size() - 2 * clusterCount; - while (cap > 0) { - cap -= clusterMinStart; - clusterMinStart--; - } - - // point the real cluster insert point somewhere between the minimum and the maximum - int clusterStart = clusterCount - (int) (Math.random() * (clusterCount - clusterMinStart)); - - // do the insert - indexContainer c; - Iterator i = newContainer.entries(); - for (int j = clusterStart; j >= 1; j--) { - c = new indexContainer(newContainer.getWordHash(), payloadrow, false); - for (int k = 0; k < j; k++) { - if (i.hasNext()) { - c.add((indexRWIEntry) i.next(), newContainer.updated()); - } else { - storeForced(c); - return; - } - } - storeForced(c); - } - } - - public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { - indexContainer container = new indexContainer(wordHash, payloadrow, false); - container.add(newEntry); - return addEntries(container, updateTime, dhtCase); - } - - public long getUpdateTime(String wordHash) { - indexContainer entries = getContainer(wordHash, null, false, -1); - if (entries == null) return 0; - return entries.updated(); - } - - public indexContainer addEntries(indexContainer newContainer, long creationTime, boolean dhtCase) { - // this is called by the index ram cache flush process - // it returnes NULL if the storage was successful - // it returnes a new container if the given container cannot be stored - // containers that are returned will be stored in a WORDS file - if (newContainer == null) return null; - if (newContainer.size() > clusterCapacity) return newContainer; // it will not fit - - // split the container into several smaller containers that will take the whole thing - // first find out how the container can be splitted - int testsize = Math.min(clusterCount, newContainer.size()); - int [] spaces = new int[testsize]; - for (int i = testsize - 1; i >= 0; i--) spaces[i] = 0; - int need = newContainer.size(); - int selectedAssortment = testsize - 1; - while (selectedAssortment >= 0) { - if (selectedAssortment + 1 <= need) { - spaces[selectedAssortment] = (assortments[selectedAssortment].get(newContainer.getWordHash()) == null) ? (selectedAssortment + 1) : 0; - need -= spaces[selectedAssortment]; - assert (need >= 0); - if (need == 0) break; - } - selectedAssortment--; - } - if (need == 0) { - // we found spaces so that we can put in the newContainer into these spaces - indexContainer c; - Iterator i = newContainer.entries(); - for (int j = testsize - 1; j >= 0; j--) { - if (spaces[j] == 0) continue; - c = new indexContainer(newContainer.getWordHash(), payloadrow, false); - for (int k = 0; k <= j; k++) { - assert (i.hasNext()); - c.add((indexRWIEntry) i.next(), newContainer.updated()); - } - try { - storeForced(c); - } catch (IOException e) { - e.printStackTrace(); - } - } - return null; - } - - if (newContainer.size() <= clusterCount) try { - newContainer = storeSingular(newContainer); - } catch (IOException e) { - e.printStackTrace(); - } - if (newContainer == null) return null; - - // clean up the whole thing and try to insert the container then - newContainer.add(deleteContainer(newContainer.getWordHash(), -1), -1); - if (newContainer.size() > clusterCapacity) return newContainer; - try { - storeStretched(newContainer); - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - - public indexContainer deleteContainer(String wordHash) { - return deleteContainer(wordHash, -1); - } - - public indexContainer deleteContainer(String wordHash, long maxTime) { - // removes all records from all the assortments and return them - indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false); - long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; - long remainingTime; - for (int i = 0; i < clusterCount; i++) { - buffer = assortments[i].remove(wordHash); - remainingTime = limitTime - System.currentTimeMillis(); - if (0 > remainingTime) break; - if (buffer != null) record.add(buffer, remainingTime); - } - return record; - } - - /* - public int removeEntries(String wordHash, String[] referenceHashes, boolean deleteComplete) { - indexContainer c = deleteContainer(wordHash, -1); - int b = c.size(); - c.removeEntries(wordHash, referenceHashes, false); - if (c.size() != 0) { - addEntries(c, c.updated(), false); - } - return b - c.size(); - } - */ - - public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { - indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false); - boolean found = false; - for (int i = 0; i < clusterCount; i++) { - buffer = assortments[i].remove(wordHash); - if ((buffer != null) && (buffer.remove(urlHash) != null)) found = true; - record.add(buffer, -1); - if (found) break; - } - // put back remaining - if (record.size() != 0) { - addEntries(record, record.updated(), false); - } - return found; - } - - public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) { - indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false); - int initialSize = urlHashes.size(); - for (int i = 0; i < clusterCount; i++) { - buffer = assortments[i].remove(wordHash); - if (buffer != null) { - // sort out url hashes that shall be deleted - Iterator bi = buffer.entries(); - indexRWIEntry entry; - while (bi.hasNext()) { - entry = (indexRWIEntry) bi.next(); - if (urlHashes.remove(entry.urlHash())) bi.remove(); - } - record.add(buffer, -1); - } - if (urlHashes.size() == 0) break; - } - // put back remaining - if (record.size() != 0) { - addEntries(record, record.updated(), false); - } - return initialSize - urlHashes.size(); - } - - public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) { - // collect all records from all the assortments and return them - indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false); - long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; - for (int i = 0; i < clusterCount; i++) { - buffer = assortments[i].get(wordHash); - if (buffer != null) { - buffer.select(urlselection); - record.add(buffer, -1); - } - if (System.currentTimeMillis() > timeout) break; - } - return record; - } - - public int indexSize(String wordHash) { - int size = 0; - for (int i = 0; i < clusterCount; i++) { - if (assortments[i].contains(wordHash)) size += i + 1; - } - return size; - } - - public Iterator wordContainers(String startWordHash, boolean rot) { - try { - return wordContainers(startWordHash, true, rot); - } catch (IOException e) { - return new HashSet().iterator(); - } - } - - public Iterator wordContainers(String startWordHash, boolean up, boolean rot) throws IOException { - // iterates indexContainer - Objects - HashSet containerIterators = new HashSet(); - for (int i = 0; i < clusterCount; i++) containerIterators.add(assortments[i].containers(startWordHash, up, rot)); - return kelondroMergeIterator.cascade(containerIterators, new indexContainerOrder(kelondroNaturalOrder.naturalOrder), indexContainer.containerMergeMethod, up); - } - - public int size() { - int total = 0; - for (int i = 0; i < clusterCount; i++) total += assortments[i].size(); - return total; - } - - public int[] sizes() { - int[] sizes = new int[clusterCount]; - for (int i = 0; i < clusterCount; i++) sizes[i] = assortments[i].size(); - return sizes; - } - - public int cacheChunkSizeAvg() { - int i = 0; - for (int j = 0; j < clusterCount; j++) { - i += assortments[j].cacheNodeChunkSize(); - } - return i / clusterCount; - } - - public int cacheObjectSizeAvg() { - long c = 0, k = 0; - for (int j = 0; j < clusterCount; j++) { - c += assortments[j].size() * assortments[j].cacheObjectChunkSize(); - k += assortments[j].size(); - } - return (k > 0) ? (int) (c / k) : 0; - } - - public int[] cacheNodeStatus() { - int[][] a = new int[assortments.length][]; - for (int i = assortments.length - 1; i >= 0; i--) a[i] = assortments[i].cacheNodeStatus(); - return kelondroRecords.cacheCombinedStatus(a, assortments.length); - } - - public long[] cacheObjectStatus() { - long[][] a = new long[assortments.length][]; - for (int i = assortments.length - 1; i >= 0; i--) a[i] = assortments[i].cacheObjectStatus(); - return kelondroCache.combinedStatus(a, a.length); - } - - public void close(int waitingSeconds) { - for (int i = 0; i < clusterCount; i++) assortments[i].close(); - } - -} diff --git a/source/de/anomic/plasma/plasmaWordIndexFile.java b/source/de/anomic/plasma/plasmaWordIndexFile.java index 68ed2691f..d1bd492cd 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFile.java +++ b/source/de/anomic/plasma/plasmaWordIndexFile.java @@ -50,6 +50,7 @@ import java.util.Iterator; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIEntryNew; import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroRow; @@ -131,7 +132,7 @@ public final class plasmaWordIndexFile { public indexRWIEntry getEntry(String urlhash) throws IOException { kelondroRow.Entry n = theIndex.get(urlhash.getBytes()); if (n == null) return null; - return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null)); + return new indexRWIEntryNew(new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null))); } public boolean contains(String urlhash) throws IOException { @@ -142,33 +143,12 @@ public final class plasmaWordIndexFile { return (theIndex.get(entry.urlHash().getBytes()) != null); } - public boolean addEntry(indexRWIEntry entry) throws IOException { - if (entry == null) return false; - indexRWIEntry oldEntry = getEntry(entry.urlHash()); - if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity - return false; - } - return (theIndex.put(entry.toKelondroEntry()) == null); + public void addEntry(indexRWIEntry entry) { + throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above"); } - public int addEntries(indexContainer container) throws IOException { - //System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug - // fetch the index cache - if ((container == null) || (container.size() == 0)) return 0; - - // open file - int count = 0; - - // write from vector - if (container != null) { - Iterator i = container.entries(); - while (i.hasNext()) { - if (addEntry((indexRWIEntry) i.next())) count++; - } - } - - // close and return - return count; + public void addEntries(indexContainer container) { + throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above"); } public boolean deleteComplete() { @@ -228,7 +208,7 @@ public final class plasmaWordIndexFile { public Object next() { if (i == null) return null; kelondroRow.Entry n = (kelondroRow.Entry) i.next(); - return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null)); + return new indexRWIEntryNew(new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null))); } public void remove() { throw new UnsupportedOperationException(); @@ -239,8 +219,7 @@ public final class plasmaWordIndexFile { return "DB:" + theIndex.toString(); } - - public void merge(plasmaWordIndexFile otherEntity, long time) throws IOException { + public void merge(plasmaWordIndexFile otherEntity, long time) { // this is a merge of another entity to this entity // the merge is interrupted when the given time is over // a time=-1 means: no timeout @@ -255,174 +234,4 @@ public final class plasmaWordIndexFile { } } - /* - // join methods - private static int log2(int x) { - int l = 0; - while (x > 0) {x = x >> 1; l++;} - return l; - } - - public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException { - - // big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big. - // this will result in a OR behavior of the search instead of an AND behavior - - long stamp = System.currentTimeMillis(); - - // order entities by their size - TreeMap map = new TreeMap(); - plasmaWordIndexEntity singleEntity; - Iterator i = entities.iterator(); - int count = 0; - while (i.hasNext()) { - // get next entity: - singleEntity = (plasmaWordIndexEntity) i.next(); - - // check result - if ((singleEntity == null) || (singleEntity.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a cunjunction of searches, we have no result if any word is not known - - // store result in order of result size - map.put(new Long(singleEntity.size() * 1000 + count), singleEntity); - count++; - } - - // check if there is any result - if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found - - // the map now holds the search results in order of number of hits per word - // we now must pairwise build up a conjunction of these sets - Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries - plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k); - while ((map.size() > 0) && (searchResult.size() > 0)) { - // take the first element of map which is a result and combine it with result - k = (Long) map.firstKey(); // the next smallest... - time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); - searchA = searchResult; - searchB = (plasmaWordIndexEntity) map.remove(k); - searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1)); - // close the input files/structures - if (searchA != searchResult) searchA.close(); - if (searchB != searchResult) searchB.close(); - } - searchA = null; // free resources - searchB = null; // free resources - - // in 'searchResult' is now the combined search result - if (searchResult.size() == 0) return new plasmaWordIndexEntity(null); - return searchResult; - } - - - public static plasmaWordIndexEntity joinConstructive(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException { - if ((i1 == null) || (i2 == null)) return null; - if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntity(null); - - // decide which method to use - int high = ((i1.size() > i2.size()) ? i1.size() : i2.size()); - int low = ((i1.size() > i2.size()) ? i2.size() : i1.size()); - int stepsEnum = 10 * (high + low - 1); - int stepsTest = 12 * log2(high) * low; - - // start most efficient method - if (stepsEnum > stepsTest) { - if (i1.size() < i2.size()) - return joinConstructiveByTest(i1, i2, time); - else - return joinConstructiveByTest(i2, i1, time); - } else { - return joinConstructiveByEnumeration(i1, i2, time); - } - } - - private static plasmaWordIndexEntity joinConstructiveByTest(plasmaWordIndexEntity small, plasmaWordIndexEntity large, long time) throws IOException { - System.out.println("DEBUG: JOIN METHOD BY TEST"); - plasmaWordIndexEntity conj = new plasmaWordIndexEntity(null); // start with empty search result - Iterator se = small.elements(true); - plasmaWordIndexEntry ie0, ie1; - long stamp = System.currentTimeMillis(); - try { - while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { - ie0 = (plasmaWordIndexEntry) se.next(); - ie1 = large.getEntry(ie0.getUrlHash()); - if (ie1 != null) { - // this is a hit. Calculate word distance: - ie0.combineDistance(ie1); - conj.addEntry(ie0); - } - } - } catch (kelondroException e) { - //serverLog.logSevere("PLASMA", "joinConstructiveByTest: Database corrupt (" + e.getMessage() + "), deleting index"); - small.deleteComplete(); - return conj; - } - return conj; - } - - private static plasmaWordIndexEntity joinConstructiveByEnumeration(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException { - System.out.println("DEBUG: JOIN METHOD BY ENUMERATION"); - plasmaWordIndexEntity conj = new plasmaWordIndexEntity(null); // start with empty search result - Iterator e1 = i1.elements(true); - Iterator e2 = i2.elements(true); - int c; - if ((e1.hasNext()) && (e2.hasNext())) { - plasmaWordIndexEntry ie1; - plasmaWordIndexEntry ie2; - try { - ie1 = (plasmaWordIndexEntry) e1.next(); - } catch (kelondroException e) { - //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database corrupt 1 (" + e.getMessage() + "), deleting index"); - i1.deleteComplete(); - return conj; - } - try { - ie2 = (plasmaWordIndexEntry) e2.next(); - } catch (kelondroException e) { - //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database corrupt 2 (" + e.getMessage() + "), deleting index"); - i2.deleteComplete(); - return conj; - } - long stamp = System.currentTimeMillis(); - while ((System.currentTimeMillis() - stamp) < time) { - c = ie1.getUrlHash().compareTo(ie2.getUrlHash()); - if (c < 0) { - try { - if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break; - } catch (kelondroException e) { - //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 1 corrupt (" + e.getMessage() + "), deleting index"); - i1.deleteComplete(); - break; - } - } else if (c > 0) { - try { - if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break; - } catch (kelondroException e) { - //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 2 corrupt (" + e.getMessage() + "), deleting index"); - i2.deleteComplete(); - break; - } - } else { - // we have found the same urls in different searches! - ie1.combineDistance(ie2); - conj.addEntry(ie1); - try { - if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break; - } catch (kelondroException e) { - //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 1 corrupt (" + e.getMessage() + "), deleting index"); - i1.deleteComplete(); - break; - } - try { - if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break; - } catch (kelondroException e) { - //serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 2 corrupt (" + e.getMessage() + "), deleting index"); - i2.deleteComplete(); - break; - } - } - } - } - return conj; - } -*/ } diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java index 8cd76accd..b14146c2a 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java @@ -43,7 +43,6 @@ package de.anomic.plasma; import java.io.File; -import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; @@ -53,23 +52,19 @@ import java.util.TreeSet; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRI; +import de.anomic.index.indexRWIEntryNew; +import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroNaturalOrder; -import de.anomic.kelondro.kelondroRow; -import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; public class plasmaWordIndexFileCluster implements indexRI { // class variables - private final File databaseRoot; - private final serverLog log; - private int size; - private kelondroRow payloadrow; + private final File databaseRoot; + private int size; - public plasmaWordIndexFileCluster(File databaseRoot, kelondroRow payloadrow, serverLog log) { + public plasmaWordIndexFileCluster(File databaseRoot) { this.databaseRoot = databaseRoot; - this.payloadrow = payloadrow; - this.log = log; this.size = 0; } @@ -77,7 +72,6 @@ public class plasmaWordIndexFileCluster implements indexRI { return size; } - public Iterator wordContainers(String startHash, boolean rot) { return new containerIterator(wordHashes(startHash, rot)); } @@ -234,16 +228,16 @@ public class plasmaWordIndexFileCluster implements indexRI { if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute if (exists(wordHash)) { plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10); - indexContainer container = new indexContainer(wordHash, payloadrow, false); - indexRWIEntry entry; + indexContainer container = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow); + indexRWIEntryNew entry; Iterator i = entity.elements(true); while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) { - entry = (indexRWIEntry) i.next(); + entry = new indexRWIEntryNew((indexRWIEntryOld) i.next()); if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry); } return container; } else { - return new indexContainer(wordHash, payloadrow, false); + return new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow); } } @@ -258,80 +252,26 @@ public class plasmaWordIndexFileCluster implements indexRI { public indexContainer deleteContainer(String wordHash) { plasmaWordIndexFile.removePlasmaIndex(databaseRoot, wordHash); - return new indexContainer(wordHash, payloadrow, false); + return null; } public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { - // removes all given url hashes from a single word index. Returns number of deletions. - plasmaWordIndexFile pi = null; - boolean removed = false; - if (exists(wordHash)) try { - pi = getEntity(wordHash, true, -1); - if (pi.removeEntry(urlHash, deleteComplete)) removed = true; - int size = pi.size(); - pi.close(); pi = null; - // check if we can remove the index completely - if ((deleteComplete) && (size == 0)) deleteContainer(wordHash); - return removed; - } catch (IOException e) { - log.logSevere("plasmaWordIndexClassic.removeEntries: " + e.getMessage()); - return false; - } finally { - if (pi != null) try{pi.close();}catch(Exception e){} - } else return false; + throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above"); } public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) { - // removes all given url hashes from a single word index. Returns number of deletions. - plasmaWordIndexFile pi = null; - int count = 0; - if (exists(wordHash)) try { - pi = getEntity(wordHash, true, -1); - Iterator i = urlHashes.iterator(); - while (i.hasNext()) if (pi.removeEntry((String) i.next(), deleteComplete)) count++; - int size = pi.size(); - pi.close(); pi = null; - // check if we can remove the index completely - if ((deleteComplete) && (size == 0)) deleteContainer(wordHash); - return count; - } catch (IOException e) { - log.logSevere("plasmaWordIndexClassic.removeEntries: " + e.getMessage()); - return count; - } finally { - if (pi != null) try{pi.close();}catch(Exception e){} - } else return 0; + throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above"); } - public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { - indexContainer container = new indexContainer(wordHash, payloadrow, false); - container.add(newEntry); - return addEntries(container, updateTime, dhtCase); + public void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { + throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above"); } - public indexContainer addEntries(indexContainer container, long creationTime, boolean highPriority) { - //System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug - // fetch the index cache - if ((container == null) || (container.size() == 0)) return null; - - // open file - plasmaWordIndexFile pi = null; - try { - pi = new plasmaWordIndexFile(databaseRoot, container.getWordHash(), false); - pi.addEntries(container); - - // close and return - pi.close(); pi = null; - return null; - } catch (IOException e) { - log.logSevere("plasmaWordIndexClassic.addEntries: " + e.getMessage()); - return container; - } finally { - if (pi != null) try{pi.close();}catch (Exception e){} - } + public void addEntries(indexContainer container, long creationTime, boolean highPriority) { + throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above"); } - public void close(int waitingSeconds) { - + public void close() { } public int indexSize(String wordHash) { diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java index ec17d28fe..8f372a69d 100644 --- a/source/de/anomic/urlRedirector/urlRedirectord.java +++ b/source/de/anomic/urlRedirector/urlRedirectord.java @@ -190,9 +190,9 @@ public class urlRedirectord implements serverHandler { ) { // first delete old entry, if exists String urlhash = plasmaURL.urlHash(this.nextURL); - switchboard.urlPool.loadedURL.remove(urlhash); - switchboard.urlPool.noticeURL.remove(urlhash); - switchboard.urlPool.errorURL.remove(urlhash); + switchboard.wordIndex.loadedURL.remove(urlhash); + switchboard.noticeURL.remove(urlhash); + switchboard.errorURL.remove(urlhash); // enqueuing URL for crawling reasonString = switchboard.sbStackCrawlThread.stackCrawl( diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 398f548d9..2134dcf90 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -52,7 +52,6 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; @@ -520,30 +519,12 @@ public final class yacyClient { // save the url entry indexRWIEntry entry; if (urlEntry.word() == null) { - // the old way to define words - int urlLength = comp.url().toNormalform().length(); - int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length; - - entry = wordIndex.newRWIEntry( - urlEntry.hash(), - urlLength, - urlComps, - comp.descr().length(), - urlEntry.wordCount(), - 0, 0, 0, 0, 0, 0, - urlEntry.size(), - urlEntry.moddate().getTime(), - System.currentTimeMillis(), - 0, - urlEntry.language(), - urlEntry.doctype(), - 0,0, - new kelondroBitfield(4) - ); - } else { - // the new way: the search-result-url transports all the attributes of word indexes - entry = urlEntry.word(); + yacyCore.log.logWarning("DEBUG-SEARCH: no word attached from peer " + targetPeer.getName() + ", version " + targetPeer.getVersion()); + continue; // no word attached } + // the search-result-url transports all the attributes of word indexes + entry = urlEntry.word(); + if (urlEntry.snippet() != null) { // we don't store the snippets along the url entry, because they are search-specific. // instead, they are placed in a snipped-search cache. diff --git a/source/de/anomic/yacy/yacyDHTAction.java b/source/de/anomic/yacy/yacyDHTAction.java index 6e3d5ad22..7a5482c06 100644 --- a/source/de/anomic/yacy/yacyDHTAction.java +++ b/source/de/anomic/yacy/yacyDHTAction.java @@ -234,6 +234,7 @@ public class yacyDHTAction implements yacyPeerAction { } public static boolean shallBeOwnWord(String wordhash) { + if (yacyCore.seedDB == null) return false; if (yacyCore.seedDB.mySeed.isPotential()) return false; final double distance = dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash); final double max = 1.2 / yacyCore.seedDB.sizeConnected(); diff --git a/source/de/anomic/yacy/yacyPeerActions.java b/source/de/anomic/yacy/yacyPeerActions.java index 219169f8e..7644ffba8 100644 --- a/source/de/anomic/yacy/yacyPeerActions.java +++ b/source/de/anomic/yacy/yacyPeerActions.java @@ -134,8 +134,8 @@ public class yacyPeerActions { sb.setConfig("totalPPM", Long.toString(indexedc / 1)); //no division by zero seedDB.mySeed.put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30) - seedDB.mySeed.put(yacySeed.LCOUNT, Integer.toString(sb.urlPool.loadedURL.size())); // the number of links that the peer has stored (LURL's) - seedDB.mySeed.put(yacySeed.NCOUNT, Integer.toString(sb.urlPool.noticeURL.stackSize())); // the number of links that the peer has noticed, but not loaded (NURL's) + seedDB.mySeed.put(yacySeed.LCOUNT, Integer.toString(sb.wordIndex.loadedURL.size())); // the number of links that the peer has stored (LURL's) + seedDB.mySeed.put(yacySeed.NCOUNT, Integer.toString(sb.noticeURL.stackSize())); // the number of links that the peer has noticed, but not loaded (NURL's) seedDB.mySeed.put(yacySeed.ICOUNT, Integer.toString(sb.cacheSizeMin())); // the minimum number of words that the peer has indexed (as it says) seedDB.mySeed.put(yacySeed.SCOUNT, Integer.toString(seedDB.sizeConnected())); // the number of seeds that the peer has stored seedDB.mySeed.put(yacySeed.CCOUNT, Double.toString(((int) ((seedDB.sizeConnected() + seedDB.sizeDisconnected() + seedDB.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour) diff --git a/source/yacy.java b/source/yacy.java index 78abbeb21..fa718369b 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -72,11 +72,12 @@ import de.anomic.http.httpdFileHandler; import de.anomic.http.httpdProxyHandler; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; -import de.anomic.index.indexRWIEntryOld; +import de.anomic.index.indexRWIEntryNew; import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntryOld; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroDyn; +import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroRow; @@ -87,10 +88,11 @@ import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaURLPool; import de.anomic.plasma.plasmaWordIndex; -import de.anomic.plasma.plasmaWordIndexAssortmentCluster; +import de.anomic.plasma.plasmaWordIndexAssortment; import de.anomic.plasma.plasmaWordIndexFile; +import de.anomic.plasma.plasmaWordIndexFileCluster; +import de.anomic.plasma.dbImport.AssortmentImporter; import de.anomic.server.serverCore; import de.anomic.server.serverDate; import de.anomic.server.serverFileUtils; @@ -655,34 +657,96 @@ public final class yacy { File indexRoot = new File(new File(homePath), "DATA/INDEX"); serverLog log = new serverLog("WORDMIGRATION"); log.logInfo("STARTING MIGRATION"); - plasmaWordIndex wordIndexCache = null; - wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, true, 20000, 10000, log); + plasmaWordIndex wordIndexCache = new plasmaWordIndex(indexRoot, 60000000, 60000000, 10000, log); enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true); String wordhash; File wordfile; - Object migrationStatus; + int migrationCount; while (words.hasMoreElements()) try { wordfile = (File) words.nextElement(); wordhash = wordfile.getName().substring(0, 12); // System.out.println("NOW: " + wordhash); - migrationStatus = wordIndexCache.migrateWords2index(wordhash); - if (migrationStatus instanceof Integer) { - int migrationCount = ((Integer) migrationStatus).intValue(); + migrationCount = migrateWords2index(dbroot, wordhash, wordIndexCache); + if (migrationCount >= 0) { if (migrationCount == 0) log.logInfo("SKIPPED " + wordhash + ": empty"); else if (migrationCount > 0) log.logInfo("MIGRATED " + wordhash + ": " + migrationCount + " entries"); else log.logInfo("REVERSED " + wordhash + ": " + (-migrationCount) + " entries"); - } else if (migrationStatus instanceof String) { - log.logInfo("SKIPPED " + wordhash + ": " + migrationStatus); + } else { + log.logInfo("SKIPPED " + wordhash); } } catch (Exception e) { log.logSevere("Exception", e); } log.logInfo("FINISHED MIGRATION JOB, WAIT FOR DUMP"); - wordIndexCache.close(60); + wordIndexCache.close(); + log.logInfo("TERMINATED MIGRATION"); + } + + + public static int migrateWords2index(File oldDatabaseRoot, String wordhash, plasmaWordIndex wi) throws IOException { + // returns the number of entries that had been added to the assortments + // can be negative if some assortments have been moved to the backend + File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash); + if (!(db.exists())) { + serverLog.logSevere("migrateWordIndex", "word index file for hash " + wordhash + " not found"); + return -1; + } + plasmaWordIndexFile entity = null; + try { + entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true); + int size = entity.size(); + indexContainer container = new indexContainer(wordhash, indexRWIEntryNew.urlEntryRow); + + try { + Iterator entries = entity.elements(true); + indexRWIEntry entry; + while (entries.hasNext()) { + entry = (indexRWIEntry) entries.next(); + // System.out.println("ENTRY = " + entry.getUrlHash()); + container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis()); + } + // we have read all elements, now delete the entity + entity.deleteComplete(); + entity.close(); + entity = null; + + wi.addEntries(container, container.updated(), false); + return size; + } catch (kelondroException e) { + // database corrupted, we simply give up the database and delete it + try { entity.close(); } catch (Exception ee) { } + entity = null; + try { db.delete(); } catch (Exception ee) { } + serverLog.logSevere("migrateWordIndex", "database for hash " + wordhash + " corrupted; deleted"); + return -1; + } + } finally { + if (entity != null) try {entity.close();}catch(Exception e){} + } + } + + public static void migrateAssortments(String homePath) { + // run with "java -classpath classes yacy -migrateassortments" + try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} + serverLog log = new serverLog("ASSORTMENTMIGRATION"); + File aclusterroot = new File(new File(homePath), "DATA/PLASMADB/ACLUSTER"); + File indexRoot = new File(new File(homePath), "DATA/INDEX"); + plasmaWordIndex wordIndexCache = new plasmaWordIndex(indexRoot, 60000000, 60000000, 10000, log); + log.logInfo("STARTING MIGRATION"); + String[] a = aclusterroot.list(); + AssortmentImporter importer = new AssortmentImporter(wordIndexCache); + for (int i = a.length - 1; i >= 0; i--) { + if (a[i].startsWith("indexAssortment")) { + importer.init(new File(aclusterroot, a[i]), 16000000, 2000); + importer.run(); + } + } + log.logInfo("FINISHED MIGRATION JOB, WAIT FOR DUMP"); + wordIndexCache.close(); log.logInfo("TERMINATED MIGRATION"); } @@ -693,7 +757,6 @@ public final class yacy { public static void minimizeUrlDB(String homePath, int dbcache) { // run with "java -classpath classes yacy -minimizeUrlDB" try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} - File plasmaroot = new File(new File(homePath), "DATA/PLASMADB"); File indexRoot = new File(new File(homePath), "DATA/INDEX"); serverLog log = new serverLog("URL-CLEANUP"); try { @@ -702,17 +765,17 @@ public final class yacy { // db containing all currently loades urls int cache = dbcache * 1024; // in KB log.logFine("URLDB-Caches: "+cache+" bytes"); - plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexRoot, cache, 10000); + plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, cache, 10000); // db used to hold all neede urls - plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(plasmaroot, "minimized"), indexRoot, cache, 10000); + plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot, cache, 10000); Runtime rt = Runtime.getRuntime(); - int cacheMem = (int)((serverMemory.max-rt.totalMemory())/1024)-(2*cache + 8*1024); - if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up."); + int cacheMem = (int)(serverMemory.max-rt.totalMemory()); + if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); - plasmaWordIndex wordIndex = new plasmaWordIndex(plasmaroot, indexRoot, true, cacheMem, 10000, log); - Iterator indexContainerIterator = wordIndex.wordContainers("------------", plasmaWordIndex.RL_WORDFILES, false); + plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, cacheMem, cacheMem, 10000, log); + Iterator indexContainerIterator = wordIndex.wordContainers("------------", false, false); long urlCounter = 0, wordCounter = 0; long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0; @@ -767,7 +830,7 @@ public final class yacy { currentUrlDB.close(); minimizedUrlDB.close(); - wordIndex.close(600); + wordIndex.close(); // TODO: rename the mimimized UrlDB to the name of the previous UrlDB @@ -941,16 +1004,16 @@ public final class yacy { File root = new File(homePath); try { - plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000); + final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf"); HashMap doms = new HashMap(); - System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries."); + System.out.println("Started domain list extraction from " + sb.wordIndex.loadedURL.size() + " url entries."); System.out.println("a dump will be written after double-check of all extracted domains."); System.out.println("This process may fail in case of too less memory. To increase memory, start with"); System.out.println("java -Xmxm -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ ]"); int c = 0; long start = System.currentTimeMillis(); if (source.equals("lurl")) { - Iterator eiter = pool.loadedURL.entries(true, false, null); + Iterator eiter = sb.wordIndex.loadedURL.entries(true, false, null); indexURLEntry entry; while (eiter.hasNext()) { try { @@ -966,11 +1029,11 @@ public final class yacy { c + " urls checked, " + doms.size() + " domains collected, " + ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + - ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining."); + ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining."); } } if (source.equals("eurl")) { - Iterator eiter = pool.errorURL.entries(true, false, null); + Iterator eiter = sb.errorURL.entries(true, false, null); plasmaCrawlEURL.Entry entry; while (eiter.hasNext()) { try { @@ -985,11 +1048,11 @@ public final class yacy { c + " urls checked, " + doms.size() + " domains collected, " + ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + - ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining."); + ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining."); } } if (source.equals("nurl")) { - Iterator eiter = pool.noticeURL.entries(true, false, null); + Iterator eiter = sb.noticeURL.entries(true, false, null); plasmaCrawlNURL.Entry entry; while (eiter.hasNext()) { try { @@ -1004,7 +1067,7 @@ public final class yacy { c + " urls checked, " + doms.size() + " domains collected, " + ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " + - ((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining."); + ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining."); } } @@ -1048,7 +1111,7 @@ public final class yacy { System.out.println("Started domain list dump to file " + file); serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf)); } - pool.close(); + sb.close(); } catch (IOException e) { e.printStackTrace(); } @@ -1057,12 +1120,12 @@ public final class yacy { private static void urllist(String homePath, String source, boolean html, String targetName) { File root = new File(homePath); try { - plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000); + final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf"); File file = new File(root, targetName); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file)); if (source.equals("lurl")) { - Iterator eiter = pool.loadedURL.entries(true, false, null); + Iterator eiter = sb.wordIndex.loadedURL.entries(true, false, null); indexURLEntry entry; while (eiter.hasNext()) { entry = (indexURLEntry) eiter.next(); @@ -1079,7 +1142,7 @@ public final class yacy { } } if (source.equals("eurl")) { - Iterator eiter = pool.errorURL.entries(true, false, null); + Iterator eiter = sb.errorURL.entries(true, false, null); plasmaCrawlEURL.Entry entry; while (eiter.hasNext()) { entry = (plasmaCrawlEURL.Entry) eiter.next(); @@ -1095,7 +1158,7 @@ public final class yacy { } } if (source.equals("nurl")) { - Iterator eiter = pool.noticeURL.entries(true, false, null); + Iterator eiter = sb.noticeURL.entries(true, false, null); plasmaCrawlNURL.Entry entry; while (eiter.hasNext()) { entry = (plasmaCrawlNURL.Entry) eiter.next(); @@ -1111,14 +1174,14 @@ public final class yacy { } } bos.close(); - pool.close(); + sb.close(); } catch (IOException e) { e.printStackTrace(); } } - private static void migratelurls(File root, File urlHash) { - plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000); + private static void migratelurls(String homePath, File urlHash) { + final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf"); kelondroTree oldindex = null; try { oldindex = new kelondroTree(urlHash, 1000, -1, indexURLEntryOld.rowdef); @@ -1146,7 +1209,7 @@ public final class yacy { if (oldrow != null) try { oldentry = new indexURLEntryOld(oldrow, null); comp = oldentry.comp(); - newentry = pool.loadedURL.newEntry( + newentry = sb.wordIndex.loadedURL.newEntry( comp.url(), comp.descr(), "", @@ -1163,7 +1226,7 @@ public final class yacy { new kelondroBitfield(4), oldentry.language(), 0, 0, 0, 0, 0, 0); - pool.loadedURL.store(newentry); + sb.wordIndex.loadedURL.store(newentry); c++; } catch (IOException e) { // ignore @@ -1173,7 +1236,7 @@ public final class yacy { last = System.currentTimeMillis(); } } - pool.close(); + sb.close(); try { oldindex.close(); } catch (IOException e) { } System.out.println("MIGRATION OF " + c + " URLs FINISHED"); } @@ -1193,12 +1256,11 @@ public final class yacy { */ private static void urldbcleanup(String homePath) { File root = new File(homePath); - File plasmaroot = new File(root, "DATA/PLASMADB"); File indexroot = new File(root, "DATA/INDEX"); serverLog log = new serverLog("URLDBCLEANUP"); try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} try { - plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexroot, 4194304, 10000); + plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexroot, 4194304, 10000); currentUrlDB.urldbcleanup(); currentUrlDB.close(); } catch (IOException e) { @@ -1218,19 +1280,16 @@ public final class yacy { try { Iterator indexContainerIterator = null; if (resource.equals("all")) { - WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log); - indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false); - } else if (resource.equals("assortments")) { - plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexRWIEntryOld.urlEntryRow, 16*1024*1024, 3000, log); - indexContainerIterator = assortmentCluster.wordContainers(wordChunkStartHash, true, false); - } /*else if (resource.startsWith("assortment")) { + WordIndex = new plasmaWordIndex(indexRoot, 8*1024*1024, 8*1024*1024, 3000, log); + indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false); + } else if (resource.startsWith("assortment")) { int a = Integer.parseInt(resource.substring(10)); plasmaWordIndexAssortment assortment = new plasmaWordIndexAssortment(new File(homeDBroot, "ACLUSTER"), a, 8*1024*1024, 3000, null); - indexContainerIterator = assortment.hashes(wordChunkStartHash, true, false); + indexContainerIterator = assortment.wordContainers(wordChunkStartHash, true, false); } else if (resource.equals("words")) { - plasmaWordIndexFileCluster fileDB = new plasmaWordIndexFileCluster(homeDBroot, log); - indexContainerIterator = fileDB.wordContainers(wordChunkStartHash, true, false); - }*/ // *** FIXME *** + plasmaWordIndexFileCluster fileDB = new plasmaWordIndexFileCluster(homeDBroot); + indexContainerIterator = fileDB.wordContainers(wordChunkStartHash, false); + } int counter = 0; indexContainer container = null; if (format.equals("zip")) { @@ -1269,7 +1328,7 @@ public final class yacy { log.logSevere("IOException", e); } if (WordIndex != null) { - WordIndex.close(60); + WordIndex.close(); WordIndex = null; } } @@ -1354,10 +1413,15 @@ public final class yacy { if (args.length == 2) applicationRoot= args[1]; shutdown(applicationRoot); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratewords"))) { - // migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible + // migrate words from DATA/PLASMADB/WORDS path to collection index // attention: this may run long and should not be interrupted! if (args.length == 2) applicationRoot= args[1]; migrateWords(applicationRoot); + } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migrateassortments"))) { + // migrate assortments from DATA/PLASMADB/ACLUSTER path to collection index + // attention: this may run long and should not be interrupted! + if (args.length == 2) applicationRoot= args[1]; + migrateAssortments(applicationRoot); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) { // migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible // attention: this may run long and should not be interrupted! @@ -1437,7 +1501,7 @@ public final class yacy { urllist(applicationRoot, source, html, outfile); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) { File root = new File(applicationRoot); - migratelurls(root, new File(root, "DATA/PLASMADB/urlHash.db")); + migratelurls(applicationRoot, new File(root, "DATA/PLASMADB/urlHash.db")); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) { // generate a url list and save it in a file if (args.length == 2) applicationRoot= args[1];