- cleaned up code; removed the methods that wrote the old data structures

- added an assortment importer. The old database structures can
  be imported with
  java -classpath classes yacy -migrateassortments
- modified the word migration. The indexes from WORDS are now imported
  into the collection database. The call is
  java -classpath classes yacy -migratewords
  (same call as before; see the usage sketch below)
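
For reference, a full migration would run both commands (a sketch; it
assumes the working directory is the YaCy installation root, where the
compiled 'classes' directory lives, and the order shown is only a
suggestion):

  # import the old assortment files into the collection database
  java -classpath classes yacy -migrateassortments
  # import the WORDS index into the collection database (call unchanged)
  java -classpath classes yacy -migratewords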

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3044 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 8043bb82fb
commit 109ed0a0bb

@@ -53,19 +53,19 @@ globalheader();
<p>If you download the software, you must accept the <a href="License.html">License</a>.</p>
<p><b>Latest Release:</b>
The latest YaCy release version is 0.48<br>
The latest YaCy release version is 0.49<br>
Nightly builds from compiles out of SVN can be obtained from <a href="http://latest.yacy-forum.net">http://latest.yacy-forum.net/</a>.<br>
<ul>
<li>Generic release of YaCy (all platforms with J2SE 1.4.2: Linux, Mac OS X, Windows, Solaris):</li>
<ul>
<li><tt>from yacy.net&nbsp;&nbsp;&nbsp;: <a href="http://www.yacy.net/yacy/release/yacy_v0.48_20061010_2743.tar.gz"><tt>yacy_v0.48_20061010_2743.tar.gz</tt></a></tt></li>
<li><tt>from BerliOS.de&nbsp;: <a href="http://download.berlios.de/yacy/yacy_v0.48_20061010_2743.tar.gz"><tt>yacy_v0.48_20061010_2743.tar.gz</tt></a></tt></li><br><br>
<li><tt>from yacy.net&nbsp;&nbsp;&nbsp;: </tt><a href="http://www.yacy.net/yacy/release/yacy_v0.49_20061202_3040.tar.gz"><tt>yacy_v0.49_20061202_3040.tar.gz</tt></a></li>
<li><tt>from BerliOS.de&nbsp;: </tt><a href="http://download.berlios.de/yacy/yacy_v0.49_20061202_3040.tar.gz"><tt>yacy_v0.49_20061202_3040.tar.gz</tt></a></li><br><br>
</ul>
<li>Windows-flavour release of YaCy (same code as generic release, but with convenient Windows-Installer):</li>
<ul>
<li><tt>from yacy.net&nbsp;&nbsp;&nbsp;: <a href="http://www.yacy.net/yacy/release/yacy_v0.48_20061010_2743.exe"><tt>yacy_v0.48_20061010_2743.exe</tt></a></tt></li>
<li><tt>from BerliOS.de&nbsp;: <a href="http://download.berlios.de/yacy/yacy_v0.48_20061010_2743.exe"><tt>yacy_v0.48_20061010_2743.exe</tt></a></tt></li>
<li><tt>from yacy.net&nbsp;&nbsp;&nbsp;: </tt><a href="http://www.yacy.net/yacy/release/yacy_v0.49_20061203_3040.exe"><tt>yacy_v0.49_20061203_3040.exe</tt></a></li>
<li><tt>from BerliOS.de&nbsp;: </tt><a href="http://download.berlios.de/yacy/yacy_v0.49_20061203_3040.exe"><tt>yacy_v0.49_20061203_3040.exe</tt></a></li>
</ul>
</ul>
</p>

@@ -147,7 +147,7 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
indexURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null);
plasmaParserDocument document = null;
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
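
Note for readers of the hunks that follow: nearly all of them apply the same
refactoring pattern. The urlPool indirection is removed, so the loaded-URL
database is now reached through the word index, while the notice- and
error-URL stacks hang directly off the switchboard. A minimal before/after
sketch, using only names that appear in this diff (a fragment, not
compilable on its own):

    // before: all three URL databases were reached via the urlPool
    //   switchboard.urlPool.loadedURL.load(urlHash, null);
    //   switchboard.urlPool.noticeURL.remove(urlHash);
    //   switchboard.urlPool.errorURL.remove(urlHash);
    // after: loadedURL belongs to the word index, the stacks to the switchboard
    indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlHash, null);
    switchboard.noticeURL.remove(urlHash);
    switchboard.errorURL.remove(urlHash);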

@@ -62,7 +62,7 @@ public class IndexCleaner_p {
prop.put("bla", "post!=null");
if (post.get("action").equals("ustart")) {
if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) {
urldbCleanerThread = sb.urlPool.loadedURL.makeCleaner();
urldbCleanerThread = sb.wordIndex.loadedURL.makeCleaner();
urldbCleanerThread.start();
}
else {
@@ -77,7 +77,7 @@ public class IndexCleaner_p {
}
else if (post.get("action").equals("rstart")) {
if (indexCleanerThread==null || !indexCleanerThread.isAlive()) {
indexCleanerThread = sb.wordIndex.makeCleaner(sb.urlPool.loadedURL, post.get("wordHash","--------"));
indexCleanerThread = sb.wordIndex.makeCleaner(sb.wordIndex.loadedURL, post.get("wordHash","--------"));
indexCleanerThread.start();
}
else {
@@ -98,7 +98,7 @@ public class IndexCleaner_p {
}
if (urldbCleanerThread!=null) {
prop.put("urldb", 1);
prop.put("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/sb.urlPool.loadedURL.size())*100 + "");
prop.put("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/sb.wordIndex.loadedURL.size())*100 + "");
prop.put("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
prop.put("urldb_total", urldbCleanerThread.totalSearchedUrls);
prop.put("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);

@@ -63,7 +63,6 @@ import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
@@ -87,7 +86,7 @@ public class IndexControl_p {
prop.put("urlhash", "");
prop.put("result", "");
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
prop.put("otherHosts", "");
prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
prop.put("indexDistributeWhileCrawling", (switchboard.getConfig("allowDistributeIndexWhileCrawling", "true").equals("true")) ? "checked" : "");
@@ -170,7 +169,7 @@ public class IndexControl_p {
}
if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) {
switchboard.urlPool.loadedURL.remove(urlx[i]);
switchboard.wordIndex.loadedURL.remove(urlx[i]);
}
}
switchboard.wordIndex.deleteContainer(keyhash);
@@ -190,7 +189,7 @@ public class IndexControl_p {
}
if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) {
switchboard.urlPool.loadedURL.remove(urlx[i]);
switchboard.wordIndex.loadedURL.remove(urlx[i]);
}
}
Set urlHashes = new HashSet();
@@ -217,13 +216,13 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
urlstring = entry.comp().url().toNormalform();
prop.put("urlstring", "");
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.wordIndex.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring);
}
}
@@ -282,7 +281,7 @@ public class IndexControl_p {
indexURLEntry lurl;
while (urlIter.hasNext()) {
iEntry = (indexRWIEntry) urlIter.next();
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
lurl = switchboard.wordIndex.loadedURL.load(iEntry.urlHash(), null);
if (lurl == null) {
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
@@ -307,7 +306,7 @@ public class IndexControl_p {
// generate list
if (post.containsKey("keyhashsimilar")) {
final Iterator containerIt = switchboard.wordIndex.indexContainerSet(keyhash, plasmaWordIndex.RL_WORDFILES, true, 256).iterator();
final Iterator containerIt = switchboard.wordIndex.indexContainerSet(keyhash, false, true, 256).iterator();
indexContainer container;
int i = 0;
int rows = 0, cols = 0;
@@ -333,7 +332,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = plasmaURL.urlHash(url);
prop.put("urlhash", urlhash);
indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
@@ -347,7 +346,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashsearch")) {
indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
indexURLEntry entry = switchboard.wordIndex.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
@@ -359,7 +358,7 @@ public class IndexControl_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
final Iterator entryIt = switchboard.wordIndex.loadedURL.entries(true, true, urlhash);
StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>");
indexURLEntry entry;
int i = 0;
@@ -403,7 +402,7 @@ public class IndexControl_p {
// insert constants
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
prop.put("indexDistributeWhileCrawling", (switchboard.getConfig("allowDistributeIndexWhileCrawling", "true").equals("true")) ? "checked" : "");
prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : "");
@@ -422,7 +421,7 @@ public class IndexControl_p {
}
indexURLEntry.Components comp = entry.comp();
String referrer = null;
indexURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
indexURLEntry le = switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "<unknown>";
} else {
@@ -471,7 +470,7 @@ public class IndexControl_p {
while (en.hasNext()) {
xi = (indexRWIEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
indexURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
indexURLEntry le = switchboard.wordIndex.loadedURL.load(uh[0], null);
if (le == null) {
tm.put(uh[0], uh);
} else {

@@ -76,7 +76,7 @@ public class IndexCreateIndexingQueue_p {
}
if (post.containsKey("clearRejected")) {
switchboard.urlPool.errorURL.clearStack();
switchboard.errorURL.clearStack();
}
if (post.containsKey("moreRejected")) {
showRejectedCount = Integer.parseInt(post.get("showRejected", "10"));
@@ -172,11 +172,11 @@ public class IndexCreateIndexingQueue_p {
}
// failure cases
if (switchboard.urlPool.errorURL.stackSize() != 0) {
if (showRejectedCount > switchboard.urlPool.errorURL.stackSize()) showRejectedCount = switchboard.urlPool.errorURL.stackSize();
if (switchboard.errorURL.stackSize() != 0) {
if (showRejectedCount > switchboard.errorURL.stackSize()) showRejectedCount = switchboard.errorURL.stackSize();
prop.put("rejected", 1);
prop.put("rejected_num", switchboard.urlPool.errorURL.stackSize());
if (showRejectedCount != switchboard.urlPool.errorURL.stackSize()) {
prop.put("rejected_num", switchboard.errorURL.stackSize());
if (showRejectedCount != switchboard.errorURL.stackSize()) {
prop.put("rejected_only-latest", 1);
prop.put("rejected_only-latest_num", showRejectedCount);
prop.put("rejected_only-latest_newnum", ((int) (showRejectedCount * 1.5)));
@@ -189,9 +189,9 @@ public class IndexCreateIndexingQueue_p {
plasmaCrawlEURL.Entry entry;
yacySeed initiatorSeed, executorSeed;
int j=0;
for (int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
for (int i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) {
try {
entry = switchboard.urlPool.errorURL.stackPopEntry(i);
entry = switchboard.errorURL.stackPopEntry(i);
url = entry.url();
if (url == null) continue;

@@ -79,8 +79,8 @@ public class IndexCreateWWWGlobalQueue_p {
}
if (post.containsKey("clearcrawlqueue")) {
int c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT);
int c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT);
try { switchboard.cleanProfiles(); } catch (InterruptedException e) { /* Ignore this */}
/*
int c = 0;
@@ -94,12 +94,12 @@ public class IndexCreateWWWGlobalQueue_p {
}
}
int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
int stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (stackSize == 0) {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
prop.put("crawler-queue_num", stackSize);//num Entries
plasmaCrawlNURL.Entry urle;
boolean dark = true;

@@ -90,8 +90,8 @@ public class IndexCreateWWWLocalQueue_p {
String pattern = post.get("pattern", ".*").trim();
String option = post.get("option", ".*").trim();
if (pattern.equals(".*")) {
c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
try { switchboard.cleanProfiles(); } catch (InterruptedException e) {/* ignore this */}
} else{
Pattern compiledPattern = null;
@@ -100,13 +100,13 @@ public class IndexCreateWWWLocalQueue_p {
compiledPattern = Pattern.compile(pattern);
// iterating through the list of URLs
Iterator iter = switchboard.urlPool.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
while (iter.hasNext()) {
String value = null;
String nextHash = new String((byte[]) iter.next());
Entry entry = null;
try {
entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
entry = switchboard.noticeURL.getEntry(nextHash);
} catch (IOException e) {
continue;
}
@@ -137,7 +137,7 @@ public class IndexCreateWWWLocalQueue_p {
if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.urlPool.noticeURL.remove(nextHash);
switchboard.noticeURL.remove(nextHash);
}
}
@@ -151,18 +151,18 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
String urlHash = (String) post.get("deleteEntry");
switchboard.urlPool.noticeURL.remove(urlHash);
switchboard.noticeURL.remove(urlHash);
prop.put("LOCATION","");
return prop;
}
}
int showNum = 0, stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
int showNum = 0, stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
if (stackSize == 0) {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
plasmaCrawlNURL.Entry urle;
boolean dark = true;

@@ -168,9 +168,9 @@ public class IndexCreate_p {
// stack request
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(crawlingStart);
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.urlPool.noticeURL.remove(urlhash);
switchboard.urlPool.errorURL.remove(urlhash);
switchboard.wordIndex.loadedURL.remove(urlhash);
switchboard.noticeURL.remove(urlhash);
switchboard.errorURL.remove(urlhash);
// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
@@ -203,10 +203,10 @@ public class IndexCreate_p {
prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
prop.put("error_reasonString", reasonString);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new kelondroBitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.errorURL.stackPushEntry(ee);
}
} catch (PatternSyntaxException e) {
prop.put("error", 8); //crawlfilter does not match url
@@ -281,10 +281,10 @@ public class IndexCreate_p {
if (rejectReason == null) {
c++;
} else {
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new kelondroBitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.errorURL.stackPushEntry(ee);
}
}
@@ -412,7 +412,7 @@ public class IndexCreate_p {
int queueStackSize = switchboard.sbQueue.size();
int loaderThreadsSize = switchboard.cacheLoader.size();
int crawlerListSize = switchboard.urlPool.noticeURL.stackSize();
int crawlerListSize = switchboard.noticeURL.stackSize();
int completequeue = queueStackSize + loaderThreadsSize + crawlerListSize;
if ((completequeue > 0) || ((post != null) && (post.containsKey("refreshpage")))) {

@@ -98,7 +98,7 @@ public final class IndexImport_p {
if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
if (importerThread != null) {
importerThread.init(new File(importPath), switchboard.indexPath, cacheSize, 100);
importerThread.init(new File(importPath), cacheSize, 100);
importerThread.startIt();
}
prop.put("LOCATION","");
@@ -147,7 +147,7 @@ public final class IndexImport_p {
}
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
/*
* Loop over all currently running jobs

@@ -109,12 +109,12 @@ public class IndexMonitor {
}
// do the commands
if (post.containsKey("clearlist")) sb.urlPool.loadedURL.clearStack(tabletype);
if (post.containsKey("clearlist")) sb.wordIndex.loadedURL.clearStack(tabletype);
if (post.containsKey("deleteentry")) {
String hash = post.get("hash", null);
if (hash != null) {
// delete from database
sb.urlPool.loadedURL.remove(hash);
sb.wordIndex.loadedURL.remove(hash);
}
}
if (post.containsKey("moreIndexed")) {
@@ -126,18 +126,18 @@ public class IndexMonitor {
// create table
if (tabletype == 0) {
prop.put("table", 2);
} else if (sb.urlPool.loadedURL.getStackSize(tabletype) == 0) {
} else if (sb.wordIndex.loadedURL.getStackSize(tabletype) == 0) {
prop.put("table", 0);
} else {
prop.put("table", 1);
if (lines > sb.urlPool.loadedURL.getStackSize(tabletype)) lines = sb.urlPool.loadedURL.getStackSize(tabletype);
if (lines == sb.urlPool.loadedURL.getStackSize(tabletype)) {
if (lines > sb.wordIndex.loadedURL.getStackSize(tabletype)) lines = sb.wordIndex.loadedURL.getStackSize(tabletype);
if (lines == sb.wordIndex.loadedURL.getStackSize(tabletype)) {
prop.put("table_size", 0);
} else {
prop.put("table_size", 1);
prop.put("table_size_count", lines);
}
prop.put("table_size_all", sb.urlPool.loadedURL.getStackSize(tabletype));
prop.put("table_size_all", sb.wordIndex.loadedURL.getStackSize(tabletype));
prop.put("table_feedbackpage", "IndexMonitor.html");
prop.put("table_tabletype", tabletype);
prop.put("table_showInit", (showInit) ? 1 : 0);
@@ -153,14 +153,14 @@ public class IndexMonitor {
final plasmaHTCache cacheManager = sb.getCacheManager();
int i, cnt = 0;
for (i = sb.urlPool.loadedURL.getStackSize(tabletype) - 1; i >= (sb.urlPool.loadedURL.getStackSize(tabletype) - lines); i--) {
initiatorHash = sb.urlPool.loadedURL.getInitiatorHash(tabletype, i);
executorHash = sb.urlPool.loadedURL.getExecutorHash(tabletype, i);
for (i = sb.wordIndex.loadedURL.getStackSize(tabletype) - 1; i >= (sb.wordIndex.loadedURL.getStackSize(tabletype) - lines); i--) {
initiatorHash = sb.wordIndex.loadedURL.getInitiatorHash(tabletype, i);
executorHash = sb.wordIndex.loadedURL.getExecutorHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
urlHash = sb.urlPool.loadedURL.getUrlHash(tabletype, i);
urlHash = sb.wordIndex.loadedURL.getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = sb.urlPool.loadedURL.load(urlHash, null);
urle = sb.wordIndex.loadedURL.load(urlHash, null);
indexURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);

@@ -66,7 +66,7 @@ public class IndexShare_p {
prop.put("dtable", "");
prop.put("rtable", "");
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
return prop; // be save
}
@@ -79,7 +79,7 @@ public class IndexShare_p {
// insert constants
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
// return rewrite properties
return prop;
}

@@ -96,7 +96,7 @@ public final class IndexTransfer_p {
// insert constants
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
prop.put("ucount", Integer.toString(switchboard.wordIndex.loadedURL.size()));
prop.put("running",(switchboard.transferIdxThread==null)?0:1);
if (switchboard.transferIdxThread != null) {
String[] status = switchboard.transferIdxThread.getStatus();

@@ -175,11 +175,11 @@ public class PerformanceMemory_p {
ost = sb.cacheManager.cacheObjectStatus();
putprop(prop, env, "", "HTTP", set);
req = sb.urlPool.loadedURL.size();
chk = sb.urlPool.loadedURL.cacheNodeChunkSize();
obj = sb.urlPool.loadedURL.cacheObjectChunkSize();
slt = sb.urlPool.loadedURL.cacheNodeStatus();
ost = sb.urlPool.loadedURL.cacheObjectStatus();
req = sb.wordIndex.loadedURL.size();
chk = sb.wordIndex.loadedURL.cacheNodeChunkSize();
obj = sb.wordIndex.loadedURL.cacheObjectChunkSize();
slt = sb.wordIndex.loadedURL.cacheNodeStatus();
ost = sb.wordIndex.loadedURL.cacheObjectStatus();
putprop(prop, env, "", "LURL", set);
if (sb.sbStackCrawlThread.getDBType() != de.anomic.plasma.plasmaCrawlStacker.QUEUE_DB_TYPE_TREE) {
@@ -194,27 +194,27 @@ public class PerformanceMemory_p {
putprop(prop, env, "usePreNURLCache", "PreNURL", set);
}
if (sb.urlPool.noticeURL.getUseNewDB()) {
if (sb.noticeURL.getUseNewDB()) {
prop.put("useNURLCache", 0);
} else {
prop.put("useNURLCache", 1);
req = sb.urlPool.noticeURL.size();
chk = sb.urlPool.noticeURL.cacheNodeChunkSize();
obj = sb.urlPool.noticeURL.cacheObjectChunkSize();
slt = sb.urlPool.noticeURL.cacheNodeStatus();
ost = sb.urlPool.noticeURL.cacheObjectStatus();
req = sb.noticeURL.size();
chk = sb.noticeURL.cacheNodeChunkSize();
obj = sb.noticeURL.cacheObjectChunkSize();
slt = sb.noticeURL.cacheNodeStatus();
ost = sb.noticeURL.cacheObjectStatus();
putprop(prop, env, "useNURLCache", "NURL", set);
}
if (sb.urlPool.errorURL.getUseNewDB()) {
if (sb.errorURL.getUseNewDB()) {
prop.put("useEURLCache", 0);
} else {
prop.put("useEURLCache", 1);
req = sb.urlPool.errorURL.size();
chk = sb.urlPool.errorURL.cacheNodeChunkSize();
obj = sb.urlPool.errorURL.cacheObjectChunkSize();
slt = sb.urlPool.errorURL.cacheNodeStatus();
ost = sb.urlPool.errorURL.cacheObjectStatus();
req = sb.errorURL.size();
chk = sb.errorURL.cacheNodeChunkSize();
obj = sb.errorURL.cacheObjectChunkSize();
slt = sb.errorURL.cacheNodeStatus();
ost = sb.errorURL.cacheObjectStatus();
putprop(prop, env, "useEURLCache", "EURL", set);
}

@@ -262,7 +262,7 @@ public class PerformanceQueues_p {
}
// table cache settings
prop.put("urlCacheSize", switchboard.urlPool.loadedURL.writeCacheSize());
prop.put("urlCacheSize", switchboard.wordIndex.loadedURL.writeCacheSize());
prop.put("wordCacheWSize", switchboard.wordIndex.dhtOutCacheSize());
prop.put("wordCacheKSize", switchboard.wordIndex.dhtInCacheSize());
prop.put("maxURLinWCache", "" + switchboard.wordIndex.maxURLinDHTOutCache());

@@ -149,9 +149,9 @@ public class QuickCrawlLink_p {
}
String urlhash = plasmaURL.urlHash(crawlingStart);
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.urlPool.noticeURL.remove(urlhash);
switchboard.urlPool.errorURL.remove(urlhash);
switchboard.wordIndex.loadedURL.remove(urlhash);
switchboard.noticeURL.remove(urlhash);
switchboard.errorURL.remove(urlhash);
// create crawling profile
plasmaCrawlProfile.entry pe = null;

@@ -108,7 +108,7 @@ public class ViewFile {
if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
urlEntry = sb.wordIndex.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);

@@ -361,7 +361,7 @@ public class dir {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()), "UTF-8");
final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
final indexURLEntry newEntry = switchboard.wordIndex.loadedURL.newEntry(
url,
"YaCyShare: " + descr,
yacyCore.seedDB.mySeed.getName(),
@@ -379,8 +379,8 @@ public class dir {
"**", // language
0,0,0,0,0,0
);
switchboard.urlPool.loadedURL.store(newEntry);
switchboard.urlPool.loadedURL.stack(
switchboard.wordIndex.loadedURL.store(newEntry);
switchboard.wordIndex.loadedURL.stack(
newEntry,
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/
@@ -401,7 +401,7 @@ public class dir {
entry = (Map.Entry) words.next();
switchboard.wordIndex.removeEntry(plasmaCondenser.word2hash((String) entry.getKey()), urlhash, true);
}
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.wordIndex.loadedURL.remove(urlhash);
} catch (Exception e) {
serverLog.logSevere("DIR", "INTERNAL ERROR in dir.deletePhrase", e);
}

@@ -164,17 +164,17 @@ public class queues_p {
//local crawl queue
prop.put("localCrawlSize", Integer.toString(switchboard.getThread("50_localcrawl").getJobCount()));
int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
addNTable(prop, "list-local", switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize)));
int stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
addNTable(prop, "list-local", switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize)));
//global crawl queue
prop.put("remoteCrawlSize", Integer.toString(switchboard.getThread("61_globalcrawltrigger").getJobCount()));
//prop.put("remoteCrawlSize", Integer.toString(switchboard.getThread("62_remotetriggeredcrawl").getJobCount()));
stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
stackSize = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (stackSize == 0) {
prop.put("list-remote", 0);
} else {
addNTable(prop, "list-remote", switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, Math.min(10, stackSize)));
addNTable(prop, "list-remote", switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, Math.min(10, stackSize)));
}
// return rewrite properties

@@ -249,13 +249,13 @@ public final class crawlOrder {
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
indexURLEntry entry = switchboard.urlPool.loadedURL.load(plasmaURL.urlHash(url), null);
indexURLEntry entry = switchboard.wordIndex.loadedURL.load(plasmaURL.urlHash(url), null);
if (entry == null) {
response = "rejected";
lurl = "";
} else {
response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
switchboard.wordIndex.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString());
}
} else {

@@ -124,7 +124,7 @@ public final class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// generating a new loaded URL entry
indexURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr);
indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr);
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
@@ -135,16 +135,16 @@ public final class crawlReceipt {
"\n\tURL properties: "+ propStr);
} else try {
// put new entry into database
switchboard.urlPool.loadedURL.store(entry);
switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1);
switchboard.wordIndex.loadedURL.store(entry);
switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1);
// generating url hash
String newUrlHash = plasmaURL.urlHash(comp.url());
String oldUrlHash = plasmaURL.oldurlHash(comp.url());
// removing URL from notice URL
switchboard.urlPool.noticeURL.remove(newUrlHash);
switchboard.urlPool.noticeURL.remove(oldUrlHash);
switchboard.noticeURL.remove(newUrlHash);
switchboard.noticeURL.remove(oldUrlHash);
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform());
} catch (IOException e) {
@@ -155,11 +155,11 @@ public final class crawlReceipt {
prop.put("delay", "10");
} else {
try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield());
plasmaCrawlNURL.Entry en = switchboard.noticeURL.getEntry(receivedUrlhash);
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.urlPool.noticeURL.remove(receivedUrlhash);
switchboard.errorURL.stackPushEntry(ee);
switchboard.noticeURL.remove(receivedUrlhash);
} catch (IOException e) {
}

@@ -98,7 +98,7 @@ public final class query {
if (obj.equals("lurlcount")) {
// return the number of all available l-url's
prop.put("response", sb.urlPool.loadedURL.size());
prop.put("response", sb.wordIndex.loadedURL.size());
return prop;
}

@@ -144,7 +144,7 @@ public final class search {
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
plasmaSearchTimingProfile remoteTiming = null;
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.wordIndex.loadedURL, sb.snippetCache);
Map containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls));
if (containers != null) {
Iterator ci = containers.entrySet().iterator();
@@ -173,7 +173,7 @@ public final class search {
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery,
rankingProfile, localTiming, remoteTiming, true,
yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL,
yacyCore.log, sb.wordIndex, sb.wordIndex.loadedURL,
sb.snippetCache);
Map containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls));

@@ -53,7 +53,6 @@ import java.util.List;
import de.anomic.http.httpHeader;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
@@ -93,6 +92,7 @@ public final class transferRWI {
StringBuffer unknownURLs = new StringBuffer();
int pause = 0;
/*
boolean shortCacheFlush = false;
if ((granted) && (sb.wordIndex.busyCacheFlush)) {
// wait a little bit, maybe we got into a short flush slot
@@ -101,9 +101,10 @@
shortCacheFlush = true;
break;
}
try {Thread.sleep(100);} catch (InterruptedException e) {/* */}
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
}
*/
if (!granted) {
// we dont want to receive indexes
@@ -152,42 +153,45 @@ public final class transferRWI {
Iterator i = v.iterator();
while (i.hasNext()) {
serverCore.checkInterruption();
estring = (String) i.next();
// check if RWI entry is well-formed
p = estring.indexOf("{");
if (p > 0) {
wordHash = estring.substring(0, p);
wordhashes[received] = wordHash;
if (estring.indexOf("x=") > 0)
iEntry = new indexRWIEntryNew(estring.substring(p));
else
iEntry = new indexRWIEntryOld(estring.substring(p));
urlHash = iEntry.urlHash();
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) {
int deleted = sb.wordIndex.tryRemoveURLs(urlHash);
yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
blocked++;
} else {
sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true);
serverCore.checkInterruption();
if ((p < 0) || (estring.indexOf("x=") < 0)) {
blocked++;
continue;
}
wordHash = estring.substring(0, p);
wordhashes[received] = wordHash;
iEntry = new indexRWIEntryNew(estring.substring(p));
urlHash = iEntry.urlHash();
// block blacklisted entries
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) {
int deleted = sb.wordIndex.tryRemoveURLs(urlHash);
yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
blocked++;
continue;
}
// learn entry
sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true);
serverCore.checkInterruption();
if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) {
try {
if (sb.urlPool.loadedURL.exists(urlHash)) {
knownURL.add(urlHash);
} else {
unknownURL.add(urlHash);
}
} catch (Exception ex) {
sb.getLog().logWarning(
"transferRWI: DB-Error while trying to determine if URL with hash '" +
urlHash + "' is known.", ex);
}
receivedURL++;
}
received++;
// check if we need to ask for the corresponding URL
if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) try {
if (sb.wordIndex.loadedURL.exists(urlHash)) {
knownURL.add(urlHash);
} else {
unknownURL.add(urlHash);
}
receivedURL++;
} catch (Exception ex) {
sb.getLog().logWarning(
"transferRWI: DB-Error while trying to determine if URL with hash '" +
urlHash + "' is known.", ex);
}
received++;
}
yacyCore.seedDB.mySeed.incRI(received);

@@ -87,7 +87,7 @@ public final class transferURL {
if (granted) {
int received = 0;
int blocked = 0;
final int sizeBefore = sb.urlPool.loadedURL.size();
final int sizeBefore = sb.wordIndex.loadedURL.size();
// read the urls from the other properties and store
String urls;
indexURLEntry lEntry;
@@ -97,7 +97,7 @@ public final class transferURL {
if (urls == null) {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
} else {
lEntry = sb.urlPool.loadedURL.newEntry(urls);
lEntry = sb.wordIndex.loadedURL.newEntry(urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
@@ -113,8 +113,8 @@ public final class transferURL {
lEntry = null;
blocked++;
} else try {
sb.urlPool.loadedURL.store(lEntry);
sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
sb.wordIndex.loadedURL.store(lEntry);
sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
@@ -128,7 +128,7 @@ public final class transferURL {
yacyCore.seedDB.mySeed.incRU(received);
// return rewrite properties
final int more = sb.urlPool.loadedURL.size() - sizeBefore;
final int more = sb.wordIndex.loadedURL.size() - sizeBefore;
doublevalues = Integer.toString(received - more);
sb.getLog().logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms, Blocked " + blocked + " URLs");
if ((received - more) > 0) sb.getLog().logSevere("Received " + doublevalues + " double URLs from peer " + otherPeerName);

@@ -201,7 +201,7 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
indexURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null);
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;

@@ -86,7 +86,7 @@ public class indexCachedRI implements indexRI {
return entries.updated();
}
public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) {
public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) {
// add the entry
if (intern) {
riIntern.addEntry(wordHash, entry, updateTime, true);
@@ -94,10 +94,9 @@ public class indexCachedRI implements indexRI {
riExtern.addEntry(wordHash, entry, updateTime, false);
flushControl();
}
return null;
}
public indexContainer addEntries(indexContainer entries, long updateTime, boolean intern) {
public void addEntries(indexContainer entries, long updateTime, boolean intern) {
// add the entry
if (intern) {
riIntern.addEntries(entries, updateTime, true);
@@ -105,7 +104,6 @@ public class indexCachedRI implements indexRI {
riExtern.addEntries(entries, updateTime, false);
flushControl();
}
return null;
}
public void flushCacheSome(boolean busy) {
@@ -133,12 +131,7 @@ public class indexCachedRI implements indexRI {
// flush the wordHash
indexContainer c = ram.deleteContainer(wordHash);
if (c != null) {
indexContainer feedback = backend.addEntries(c, c.updated(), false);
if (feedback != null) {
throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString());
}
}
if (c != null) backend.addEntries(c, c.updated(), false);
// pause to next loop to give other processes a chance to use IO
//try {this.wait(8);} catch (InterruptedException e) {}
@@ -206,11 +199,11 @@ public class indexCachedRI implements indexRI {
return size;
}
public void close(int waitingBoundSeconds) {
public void close() {
synchronized (this) {
riIntern.close(waitingBoundSeconds);
riExtern.close(waitingBoundSeconds);
backend.close(-1);
riIntern.close();
riExtern.close();
backend.close();
}
}
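
The hunks above change the backend write path: addEntry and addEntries no
longer return a feedback container (the collection index always merges all
entries), and close() loses its waiting-bound parameter. A caller-side
sketch of the flush, assuming the ram/backend fields of indexCachedRI shown
above:

    // before: a feedback container had to be checked after each flush
    //   indexContainer feedback = backend.addEntries(c, c.updated(), false);
    //   if (feedback != null) throw new RuntimeException(...);
    // after: addEntries is void, so flushing one word hash is a single call
    indexContainer c = ram.deleteContainer(wordHash);
    if (c != null) backend.addEntries(c, c.updated(), false);
    // shutdown no longer takes a waiting bound in seconds
    backend.close();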

@@ -104,7 +104,7 @@ public class indexCollectionRI implements indexRI {
byte[] key = (byte[]) oo[0];
kelondroRowSet collection = (kelondroRowSet) oo[1];
if (collection == null) return null;
return new indexContainer(new String(key), collection, true);
return new indexContainer(new String(key), collection);
}
public void remove() {
@@ -118,7 +118,7 @@ public class indexCollectionRI implements indexRI {
kelondroRowSet collection = collectionIndex.get(wordHash.getBytes(), deleteIfEmpty);
if (collection != null) collection.select(urlselection);
if ((collection == null) || (collection.size() == 0)) return null;
return new indexContainer(wordHash, collection, true);
return new indexContainer(wordHash, collection);
} catch (IOException e) {
return null;
}
@@ -128,7 +128,7 @@ public class indexCollectionRI implements indexRI {
try {
kelondroRowSet collection = collectionIndex.delete(wordHash.getBytes());
if (collection == null) return null;
return new indexContainer(wordHash, collection, true);
return new indexContainer(wordHash, collection);
} catch (IOException e) {
return null;
}
@@ -152,26 +152,24 @@ public class indexCollectionRI implements indexRI {
}
}
public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow(), true);
public synchronized void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow());
container.add(newEntry);
return addEntries(container, updateTime, dhtCase);
addEntries(container, updateTime, dhtCase);
}
public synchronized indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase) {
public synchronized void addEntries(indexContainer newEntries, long creationTime, boolean dhtCase) {
String wordHash = newEntries.getWordHash();
try {
collectionIndex.merge(wordHash.getBytes(), (kelondroRowCollection) newEntries);
return null; // merge does allways 'eat' up all entries unlike the assortments; they may return an overflow container
} catch (kelondroOutOfLimitsException e) {
e.printStackTrace();
return null;
} catch (IOException e) {
return null;
e.printStackTrace();
}
}
public synchronized void close(int waitingSeconds) {
public synchronized void close() {
try {
collectionIndex.close();
} catch (IOException e) {

@@ -41,33 +41,24 @@ import de.anomic.kelondro.kelondroRowSet;
public class indexContainer extends kelondroRowSet {
private String wordHash;
private boolean newRWI;
public indexContainer(String wordHash, kelondroRow rowdef, int objectCount, byte[] cache, boolean newRWI) {
super(rowdef, objectCount, cache, kelondroBase64Order.enhancedCoder, 0, 0);
this.wordHash = wordHash;
this.newRWI = newRWI;
}
public indexContainer(String wordHash, kelondroRow rowdef, boolean newRWI) {
this(wordHash, rowdef, kelondroBase64Order.enhancedCoder, 0, newRWI);
public indexContainer(String wordHash, kelondroRow rowdef) {
this(wordHash, rowdef, kelondroBase64Order.enhancedCoder, 0);
}
public indexContainer(String wordHash, kelondroRowSet collection, boolean newRWI) {
public indexContainer(String wordHash, kelondroRowSet collection) {
super(collection);
this.wordHash = wordHash;
this.newRWI = newRWI;
}
public indexContainer(String wordHash, kelondroRow rowdef, kelondroOrder ordering, int column, boolean newRWI) {
public indexContainer(String wordHash, kelondroRow rowdef, kelondroOrder ordering, int column) {
super(rowdef, ordering, column, 0);
this.wordHash = wordHash;
this.lastTimeWrote = 0;
this.newRWI = newRWI;
}
public indexContainer topLevelClone() {
indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.sortOrder, this.sortColumn, this.newRWI);
indexContainer newContainer = new indexContainer(this.wordHash, this.rowdef, this.sortOrder, this.sortColumn);
newContainer.add(this, -1);
return newContainer;
}
@@ -133,7 +124,7 @@ public class indexContainer extends kelondroRowSet {
if (entry instanceof indexRWIEntryNew)
oldEntry = new indexRWIEntryNew(oldEntryRow);
else
oldEntry = new indexRWIEntryOld(oldEntryRow); // FIXME: see if cloning is necessary
oldEntry = new indexRWIEntryNew(new indexRWIEntryOld(oldEntryRow));
if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container
this.put(oldEntry.toKelondroEntry()); // put it back
return false;
@@ -146,19 +137,13 @@ public class indexContainer extends kelondroRowSet {
public indexRWIEntry get(String urlHash) {
kelondroRow.Entry entry = this.get(urlHash.getBytes());
if (entry == null) return null;
if (this.newRWI)
return new indexRWIEntryNew(entry);
else
return new indexRWIEntryOld(entry);
return new indexRWIEntryNew(entry);
}
public indexRWIEntry remove(String urlHash) {
kelondroRow.Entry entry = this.remove(urlHash.getBytes());
if (entry == null) return null;
if (this.newRWI)
return new indexRWIEntryNew(entry);
else
return new indexRWIEntryOld(entry);
return new indexRWIEntryNew(entry);
}
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
@@ -194,10 +179,7 @@ public class indexContainer extends kelondroRowSet {
public Object next() {
kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next();
if (rentry == null) return null;
if (newRWI)
return new indexRWIEntryNew(rentry);
else
return new indexRWIEntryOld(rentry);
return new indexRWIEntryNew(rentry);
}
public void remove() {
@@ -307,7 +289,7 @@ public class indexContainer extends kelondroRowSet {
assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString();
int keylength = small.rowdef.width(0);
assert (keylength == large.rowdef.width(0));
indexContainer conj = new indexContainer(null, small.rowdef, small.newRWI); // start with empty search result
indexContainer conj = new indexContainer(null, small.rowdef); // start with empty search result
Iterator se = small.entries();
indexRWIEntry ie0, ie1;
long stamp = System.currentTimeMillis();
@@ -330,7 +312,7 @@ public class indexContainer extends kelondroRowSet {
assert i1.rowdef.equals(i2.rowdef) : "i1 = " + i1.rowdef.toString() + "; i2 = " + i2.rowdef.toString();
int keylength = i1.rowdef.width(0);
assert (keylength == i2.rowdef.width(0));
indexContainer conj = new indexContainer(null, i1.rowdef, i1.newRWI); // start with empty search result
indexContainer conj = new indexContainer(null, i1.rowdef); // start with empty search result
if (!((i1.order().signature().equals(i2.order().signature())) &&
(i1.primarykey() == i2.primarykey()))) return conj; // ordering must be equal
Iterator e1 = i1.entries();
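
With the old RWI format gone, the newRWI flag disappears from every
indexContainer constructor, and entries always decode as indexRWIEntryNew.
A construction sketch, with payloadRow obtained from the collection index
as in the indexCollectionRI hunks above:

    // before: new indexContainer(wordHash, payloadRow, true /* newRWI */)
    indexContainer container = new indexContainer(wordHash, payloadRow);
    container.add(newEntry); // get()/remove() now always return indexRWIEntryNew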

@@ -58,7 +58,6 @@ public final class indexRAMRI implements indexRI {
private String indexArrayFileName;
private kelondroRow payloadrow;
private kelondroRow bufferStructureBasis;
private boolean newRWI;
// calculated constants
private static String maxKey;
@@ -67,7 +66,7 @@ public final class indexRAMRI implements indexRI {
//minKey = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-';
}
public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceLimitInit, String dumpname, serverLog log, boolean newRWI) {
public indexRAMRI(File databaseRoot, kelondroRow payloadrow, int wCacheReferenceLimitInit, String dumpname, serverLog log) {
// creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed
@@ -79,7 +78,6 @@ public final class indexRAMRI implements indexRI {
this.cacheMaxCount = 10000;
this.cacheReferenceLimit = wCacheReferenceLimitInit;
this.log = log;
this.newRWI = newRWI;
this.indexArrayFileName = dumpname;
this.payloadrow = payloadrow;
this.bufferStructureBasis = new kelondroRow(
@@ -103,7 +101,7 @@ public final class indexRAMRI implements indexRI {
return entries.updated();
}
private void dump(int waitingSeconds) throws IOException {
private void dump() throws IOException {
log.logConfig("creating dump for index cache '" + indexArrayFileName + "', " + cache.size() + " words (and much more urls)");
File indexDumpFile = new File(databaseRoot, indexArrayFileName);
if (indexDumpFile.exists()) indexDumpFile.delete();
@@ -180,10 +178,7 @@ public final class indexRAMRI implements indexRI {
if ((row == null) || (row.empty(0)) || (row.empty(3))) continue;
wordHash = row.getColString(0, "UTF-8");
//creationTime = kelondroRecords.bytes2long(row[2]);
if (newRWI)
wordEntry = new indexRWIEntryNew(row.getColBytes(3));
else
wordEntry = new indexRWIEntryOld(row.getColBytes(3));
wordEntry = new indexRWIEntryNew(row.getColBytes(3));
// store to cache
addEntry(wordHash, wordEntry, startTime, false);
urlCount++;
@@ -423,10 +418,10 @@ public final class indexRAMRI implements indexRI {
return delCount;
}
public synchronized indexContainer addEntries(indexContainer container, long updateTime, boolean dhtCase) {
public synchronized void addEntries(indexContainer container, long updateTime, boolean dhtCase) {
// this puts the entries into the cache, not into the assortment directly
int added = 0;
if ((container == null) || (container.size() == 0)) return null;
if ((container == null) || (container.size() == 0)) return;
// put new words into cache
String wordHash = container.getWordHash();
@@ -443,28 +438,26 @@ public final class indexRAMRI implements indexRI {
hashDate.setScore(wordHash, intTime(updateTime));
}
entries = null;
return null;
}
public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = (indexContainer) cache.get(wordHash);
if (container == null) container = new indexContainer(wordHash, this.payloadrow, true);
indexRWIEntry[] entries = new indexRWIEntry[] { newEntry };
if (container.add(entries, updateTime) > 0) {
cache.put(wordHash, container);
hashScore.incScore(wordHash);
hashDate.setScore(wordHash, intTime(updateTime));
return null;
}
container = null;
entries = null;
return null;
public synchronized void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = (indexContainer) cache.get(wordHash);
if (container == null) container = new indexContainer(wordHash, this.payloadrow);
indexRWIEntry[] entries = new indexRWIEntry[] { newEntry };
if (container.add(entries, updateTime) > 0) {
cache.put(wordHash, container);
hashScore.incScore(wordHash);
hashDate.setScore(wordHash, intTime(updateTime));
return;
}
container = null;
entries = null;
}
public synchronized void close(int waitingSeconds) {
public synchronized void close() {
// dump cache
try {
dump(waitingSeconds);
dump();
} catch (IOException e){
log.logSevere("unable to dump cache: " + e.getMessage(), e);
}

@@ -44,9 +44,9 @@ public interface indexRI {
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete);
public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete);
public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase);
public indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase);
public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase);
public void addEntries(indexContainer newEntries, long creationTime, boolean dhtCase);
public void close(int waitingSeconds);
public void close();
}
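
Pieced together from this hunk, the visible part of the slimmed-down
interface now reads as follows (package and imports inferred from the
surrounding files; the interface declares further methods above the range
shown here):

    package de.anomic.index;

    import java.util.Set;

    public interface indexRI {
        public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete);
        public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete);
        public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase);
        public void addEntries(indexContainer newEntries, long creationTime, boolean dhtCase);
        public void close();
    }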

@@ -152,10 +152,9 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
assert oldEntry.urlHash() != null;
this.entry = urlEntryRow.newEntry();
int mddlm = plasmaWordIndex.microDateDays(oldEntry.lastModified());
int mddct = plasmaWordIndex.microDateDays(System.currentTimeMillis());
this.entry.setCol(col_urlhash, oldEntry.urlHash(), null);
this.entry.setCol(col_lastModified, mddlm);
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
this.entry.setCol(col_freshUntil, 0);
this.entry.setCol(col_wordsInTitle, 20); // guessed
this.entry.setCol(col_wordsInText, oldEntry.wordcount());
this.entry.setCol(col_phrasesInText, oldEntry.phrasecount());

@@ -290,7 +290,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
String referrerHash = (this.refererURLString==null)?null:plasmaURL.urlHash(this.refererURLString);
// create a new errorURL DB entry
plasmaCrawlEURL.Entry ee = this.sb.urlPool.errorURL.newEntry(
plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry(
this.url,
referrerHash,
this.initiator,
@@ -304,7 +304,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
ee.store();
// push it onto the stack
this.sb.urlPool.errorURL.stackPushEntry(ee);
this.sb.errorURL.stackPushEntry(ee);
// delete the cache file
File cacheFile = this.cacheManager.getCachePath(this.url);

@@ -342,7 +342,7 @@ public final class CrawlWorker extends AbstractCrawlWorker {
String urlhash = plasmaURL.urlHash(redirectionUrl);
// removing url from loader queue
plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash);
plasmaCrawlLoader.switchboard.noticeURL.remove(urlhash);
// retry crawling with new url
this.url = redirectionUrl;

@@ -2,7 +2,7 @@ package de.anomic.plasma.dbImport;
import java.io.File;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.logging.serverLog;
public abstract class AbstractImporter extends Thread implements dbImporter{
@@ -13,8 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
protected boolean stopped = false;
protected boolean paused = false;
protected plasmaSwitchboard sb;
protected File importPath, indexPath;
protected File importPath;
protected int cacheSize;
protected long preloadTime;
@@ -23,27 +22,27 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
protected long globalPauseLast;
protected long globalPauseDuration;
protected String error;
protected plasmaWordIndex wi;
public AbstractImporter(plasmaSwitchboard theSb) {
super(theSb.dbImportManager.runningJobs,"");
this.sb = theSb;
public AbstractImporter(plasmaWordIndex wi) {
//super(theSb.dbImportManager.runningJobs,"");
this.wi = wi;
}
public String getError() {
return this.error;
}
public void init(File theImportPath, File theIndexPath) {
public void init(File theImportPath) {
if (theImportPath == null) throw new NullPointerException("The Import path must not be null.");
this.importPath = theImportPath;
this.indexPath = theIndexPath;
// getting a job id from the import manager
this.jobID = this.sb.dbImportManager.getJobID();
//this.jobID = this.sb.dbImportManager.getJobID();
// initializing the logger and setting a more verbose thread name
this.log = new serverLog("IMPORT_" + this.jobType + "_" + this.jobID);
this.setName("IMPORT_" + this.jobType + "_" + this.sb.dbImportManager.getJobID());
this.setName("IMPORT_" + this.jobType /*+ "_" + this.sb.dbImportManager.getJobID()*/);
}
public void startIt() {

@@ -5,8 +5,7 @@ import java.io.IOException;
import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexAssortment;
public class AssortmentImporter extends AbstractImporter implements dbImporter{
@@ -15,31 +14,29 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
private int wordEntityCount = 0;
private int wordEntryCount = 0;
private File importAssortmentFile;
private plasmaWordIndexAssortment assortmentFile;
public AssortmentImporter(plasmaSwitchboard sb) {
super(sb);
public AssortmentImporter(plasmaWordIndex wi) {
super(wi);
this.jobType = "ASSORTMENT";
}
public void init(File theImportAssortmentFile, File theIndexFile, int theCacheSize, long preloadTime) {
super.init(theImportAssortmentFile, theIndexFile);
this.importAssortmentFile = theImportAssortmentFile;
public void init(File theImportAssortmentFile, int theCacheSize, long preloadTime) {
super.init(theImportAssortmentFile);
this.cacheSize = theCacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 2*1024*1024;
String errorMsg = null;
if (!this.importAssortmentFile.getName().matches("indexAssortment0[0-6][0-9]\\.db"))
errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' has an invalid name.";
if (!this.importAssortmentFile.exists())
errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' does not exist.";
else if (this.importAssortmentFile.isDirectory())
errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is a directory.";
else if (!this.importAssortmentFile.canRead())
errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is not readable.";
else if (!this.importAssortmentFile.canWrite())
errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' is not writeable.";
if (!this.importPath.getName().matches("indexAssortment0[0-6][0-9]\\.db"))
errorMsg = "AssortmentFile '" + this.importPath + "' has an invalid name.";
if (!this.importPath.exists())
errorMsg = "AssortmentFile '" + this.importPath + "' does not exist.";
else if (this.importPath.isDirectory())
errorMsg = "AssortmentFile '" + this.importPath + "' is a directory.";
else if (!this.importPath.canRead())
errorMsg = "AssortmentFile '" + this.importPath + "' is not readable.";
else if (!this.importPath.canWrite())
errorMsg = "AssortmentFile '" + this.importPath + "' is not writeable.";
if (errorMsg != null) {
this.log.logSevere(errorMsg);
throw new IllegalStateException(errorMsg);
@@ -49,10 +46,10 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
File importAssortmentPath = null;
int assortmentNr = -1;
try {
importAssortmentPath = new File(this.importAssortmentFile.getParent());
assortmentNr = Integer.valueOf(this.importAssortmentFile.getName().substring("indexAssortment".length(),"indexAssortment".length()+3)).intValue();
importAssortmentPath = new File(this.importPath.getParent());
assortmentNr = Integer.valueOf(this.importPath.getName().substring("indexAssortment".length(),"indexAssortment".length()+3)).intValue();
if (assortmentNr < 1 || assortmentNr > 64) {
errorMsg = "AssortmentFile '" + this.importAssortmentFile + "' has an invalid name.";
errorMsg = "AssortmentFile '" + this.importPath + "' has an invalid name.";
}
} catch (NumberFormatException e) {
errorMsg = "Unable to parse the assortment file number.";
@@ -61,9 +58,9 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
}
// initializing the import assortment db
this.log.logInfo("Initializing source assortment file");
this.log.logInfo("Initializing source assortment file " + theImportAssortmentFile);
try {
this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexRWIEntryOld.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log);
this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, assortmentNr, this.cacheSize/1024, preloadTime, this.log);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@@ -95,7 +92,7 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
public void run() {
try {
// getting a content iterator
Iterator contentIterator = this.assortmentFile.containers(null, true, false);
Iterator contentIterator = this.assortmentFile.wordContainers(null, true, false);
while (contentIterator.hasNext()) {
this.wordEntityCount++;
@@ -105,14 +102,11 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
this.wordEntryCount += container.size();
// importing entity container to home db
this.sb.wordIndex.addEntries(container, System.currentTimeMillis(), false);
wi.addEntries(container, System.currentTimeMillis(), false);
if (this.wordEntityCount % 500 == 0) {
if (this.wordEntityCount % 1000 == 0) {
this.log.logFine(this.wordEntityCount + " word entities processed so far.");
}
if (this.wordEntryCount % 2000 == 0) {
this.log.logFine(this.wordEntryCount + " word entries processed so far.");
}
if (isAborted()) break;
}
} catch (Exception e) {
@@ -121,8 +115,12 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
} finally {
this.log.logInfo("Import process finished.");
this.globalEnd = System.currentTimeMillis();
this.sb.dbImportManager.finishedJobs.add(this);
//this.sb.dbImportManager.finishedJobs.add(this);
this.assortmentFile.close();
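// after a successful pass, the processed assortment file is moved into an
// "imported" subfolder of its parent directory and kept as a backup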
File bkpPath = new File(importPath.getParentFile(), "imported");
bkpPath.mkdirs();
File bkpFile = new File(bkpPath, importPath.getName());
importPath.renameTo(bkpFile);
}
}

@@ -58,10 +58,8 @@ public class dbImportManager {
if (type.length() == 0) return null;
dbImporter newImporter = null;
if (type.equals("plasmaDB")) {
newImporter = new plasmaDbImporter(this.sb);
} else if (type.equalsIgnoreCase("ASSORTMENT")) {
newImporter = new AssortmentImporter(this.sb);
if (type.equalsIgnoreCase("ASSORTMENT")) {
newImporter = new AssortmentImporter(this.sb.wordIndex);
} else if (type.equalsIgnoreCase("NURL")) {
newImporter = new plasmaCrawlNURLImporter(this.sb);
}

@@ -24,6 +24,6 @@ public interface dbImporter {
public String getError();
public String getStatus();
public void init(File importPath, File indexPath, int cacheSize, long preloadTime);
public void init(File indexPath, int cacheSize, long preloadTime);
public void startIt();
}
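// A minimal usage sketch for the narrowed interface (assumed call site; the
// file path and cache size are illustrative, not taken from this commit):
dbImporter importer = new AssortmentImporter(sb.wordIndex);
importer.init(new File("DATA/PLASMADB/ACLUSTER/indexAssortment001.db"), 8 * 1024 * 1024, 1000);
importer.startIt();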

@@ -18,9 +18,10 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
private int importStartSize;
private int urlCount = 0;
private int profileCount = 0;
private plasmaSwitchboard sb;
public plasmaCrawlNURLImporter(plasmaSwitchboard theSb) {
super(theSb);
super(theSb.wordIndex);
this.jobType="NURL";
}
@@ -45,8 +46,8 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
return theStatus.toString();
}
public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) {
super.init(theImportPath, theIndexPath);
public void init(File theImportPath, int theCacheSize, long preloadTime) {
super.init(theImportPath);
this.cacheSize = theCacheSize;
this.preloadTime = preloadTime;
@@ -174,10 +175,10 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
}
// if the url does not already exist in the destination stack, we insert it now
if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) {
plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(nextEntry);
if (!this.sb.noticeURL.existsInStack(nextHash)) {
plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(nextEntry);
ne.store();
this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
this.sb.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
}
// removing hash from the import db

@@ -9,17 +9,11 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate;
public class plasmaDbImporter extends AbstractImporter implements dbImporter {
private plasmaCrawlLURL homeUrlDB;
private plasmaWordIndex homeWordIndex;
private plasmaCrawlLURL importUrlDB;
private plasmaWordIndex importWordIndex;
private int importStartSize;
@@ -30,8 +24,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;
public plasmaDbImporter(plasmaSwitchboard theSb) {
super(theSb);
public plasmaDbImporter(plasmaWordIndex homeWI, plasmaWordIndex importWI) {
super(homeWI);
this.importWordIndex = importWI;
this.jobType = "PLASMADB";
}
@@ -51,18 +46,12 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return theStatus.toString();
}
public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) {
super.init(theImportPath, theIndexPath);
public void init(File theImportPath, int theCacheSize, long preloadTime) {
super.init(theImportPath);
this.homeWordIndex = this.sb.wordIndex;
this.homeUrlDB = this.sb.urlPool.loadedURL;
this.cacheSize = theCacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
if (this.homeWordIndex.getRoot().equals(this.importPath)) {
throw new IllegalArgumentException("Import and home DB directory must not be equal");
}
// configure import DB
String errorMsg = null;
if (!this.importPath.exists()) errorMsg = "Import directory does not exist.";
@@ -75,10 +64,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
this.log.logFine("Initializing source word index db.");
this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, true, (this.cacheSize/2)/1024, preloadTime / 2, this.log);
this.importWordIndex = new plasmaWordIndex(this.importPath, this.cacheSize/2, this.cacheSize/2, preloadTime / 2, this.log);
this.log.logFine("Initializing import URL db.");
this.importUrlDB = new plasmaCrawlLURL(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2);
this.importStartSize = this.importWordIndex.size();
}
@@ -87,7 +74,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
importWordsDB();
} finally {
this.globalEnd = System.currentTimeMillis();
this.sb.dbImportManager.finishedJobs.add(this);
//this.sb.dbImportManager.finishedJobs.add(this);
}
}
@@ -107,16 +94,16 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
this.log.logInfo("STARTING DB-IMPORT");
try {
this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "' to '" + this.homeWordIndex.getRoot().getAbsolutePath() + "'.");
this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs.");
this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");
HashSet unknownUrlBuffer = new HashSet();
HashSet importedUrlBuffer = new HashSet();
// iterate over all words from import db
//Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
Iterator indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
Iterator indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator();
while (!isAborted() && indexContainerIterator.hasNext()) {
TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true));
@@ -157,11 +144,11 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url
// getting the url entry
indexURLEntry urlEntry = this.importUrlDB.load(urlHash, null);
indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null);
if (urlEntry != null) {
/* write it into the home url db */
this.homeUrlDB.store(urlEntry);
wi.loadedURL.store(urlEntry);
importedUrlBuffer.add(urlHash);
this.urlCounter++;
@@ -183,7 +170,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
if (isAborted()) break;
// importing entity container to home db
if (newContainer.size() > 0) { this.homeWordIndex.addEntries(newContainer, System.currentTimeMillis(), false); }
if (newContainer.size() > 0) { wi.addEntries(newContainer, System.currentTimeMillis(), false); }
// delete complete index entity file
this.importWordIndex.deleteContainer(this.wordHash);
@@ -203,7 +190,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
"Speed: "+ 500*1000/duration + " word entities/s" +
" | Elapsed time: " + serverDate.intervalToString(getElapsedTime()) +
" | Estimated time: " + serverDate.intervalToString(getEstimatedTime()) + "\n" +
"Home Words = " + this.homeWordIndex.size() +
"Home Words = " + wi.size() +
" | Import Words = " + this.importWordIndex.size());
this.wordChunkStart = this.wordChunkEnd;
this.wordChunkStartHash = this.wordChunkEndHash;
@@ -217,7 +204,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
if (!indexContainerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
TreeSet containers = this.importWordIndex.indexContainerSet(this.wordHash, plasmaWordIndex.RL_WORDFILES, false, 100);
TreeSet containers = this.importWordIndex.indexContainerSet(this.wordHash, false, false, 100);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) {
@@ -226,16 +213,15 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
}
this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs.");
this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");
} catch (Exception e) {
this.log.logSevere("Database import failed.",e);
e.printStackTrace();
this.error = e.toString();
} finally {
this.log.logInfo("Import process finished.");
if (this.importUrlDB != null) try { this.importUrlDB.close(); } catch (Exception e){}
if (this.importWordIndex != null) try { this.importWordIndex.close(5000); } catch (Exception e){}
if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (Exception e){}
}
}

@@ -171,9 +171,12 @@ public class plasmaCrawlEURL {
}
}
public void close() throws IOException {
public void close() {
if (urlIndexFile != null) {
urlIndexFile.close();
try {
urlIndexFile.close();
} catch (IOException e) {
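// intentionally ignored: close() no longer declares IOException,
// so callers need no try/catch during shutdown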
}
urlIndexFile = null;
}
}

@@ -95,11 +95,11 @@ public final class plasmaCrawlLURL {
// the class object
private kelondroIndex urlIndexFile = null;
public plasmaCrawlLURL(File plasmaPath, File indexPath, int bufferkb, long preloadTime) {
public plasmaCrawlLURL(File indexPath, long buffer, long preloadTime) {
super();
try {
urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder);
urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", buffer, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@@ -583,7 +583,7 @@ public final class plasmaCrawlLURL {
} catch (MalformedURLException e) {}
if (args[0].equals("-l")) try {
// arg 1 is path to URLCache
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0);
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[2]), 1, 0);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
System.out.println(((indexURLEntry) enu.next()).toString());

@@ -393,9 +393,9 @@ public final class plasmaCrawlStacker {
// check if the url is double registered
checkInterruption();
String nexturlhash = plasmaURL.urlHash(nexturl);
String dbocc = this.sb.urlPool.exists(nexturlhash);
String dbocc = this.sb.urlExists(nexturlhash);
indexURLEntry oldEntry = null;
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
oldEntry = this.sb.wordIndex.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
@@ -437,7 +437,7 @@ public final class plasmaCrawlStacker {
// add the url into the crawling queue
checkInterruption();
plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */
loadDate, /* load date */
referrerHash, /* last url in crawling queue */
@@ -448,7 +448,7 @@ public final class plasmaCrawlStacker {
0 /*forkfactor, default value */
);
ne.store();
this.sb.urlPool.noticeURL.push(
this.sb.noticeURL.push(
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/,
nexturl.getHost(),
@@ -1053,7 +1053,7 @@ public final class plasmaCrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
plasmaCrawlEURL.Entry ee = sb.errorURL.newEntry(
new URL(this.theMsg.url()),
this.theMsg.referrerHash(),
this.theMsg.initiatorHash(),
@@ -1063,7 +1063,7 @@ public final class plasmaCrawlStacker {
new kelondroBitfield()
);
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);
sb.errorURL.stackPushEntry(ee);
}
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@@ -180,12 +180,12 @@ public class plasmaDHTChunk {
private void selectTransferContainers(String hash, int mincount, int maxcount, int maxtime) throws InterruptedException {
try {
this.selectionStartTime = System.currentTimeMillis();
int refcountRAM = selectTransferContainersResource(hash, plasmaWordIndex.RL_RAMCACHE, maxcount, maxtime);
int refcountRAM = selectTransferContainersResource(hash, true, maxcount, maxtime);
if (refcountRAM >= mincount) {
log.logFine("DHT selection from RAM: " + refcountRAM + " entries");
return;
}
int refcountFile = selectTransferContainersResource(hash, plasmaWordIndex.RL_WORDFILES, maxcount, maxtime);
int refcountFile = selectTransferContainersResource(hash, false, maxcount, maxtime);
log.logFine("DHT selection from FILE: " + refcountFile + " entries, RAM provided only " + refcountRAM + " entries");
return;
} finally {
@@ -193,11 +193,11 @@ public class plasmaDHTChunk {
}
}
private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount, int maxtime) throws InterruptedException {
private int selectTransferContainersResource(String hash, boolean ram, int maxcount, int maxtime) throws InterruptedException {
// the hash is a start hash from where the indexes are picked
ArrayList tmpContainers = new ArrayList(maxcount);
try {
Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, resourceLevel, true, maxcount).iterator();
Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, ram, true, maxcount).iterator();
indexContainer container;
Iterator urlIter;
indexRWIEntry iEntry;

@@ -169,7 +169,7 @@ public class plasmaDHTFlush extends Thread {
// selecting 500 words to transfer
this.status = "Running: Selecting chunk " + iteration;
newDHTChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.sb.urlPool.loadedURL, this.chunkSize/3*2, this.chunkSize, -1, this.startPointHash);
newDHTChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.sb.wordIndex.loadedURL, this.chunkSize/3*2, this.chunkSize, -1, this.startPointHash);
/* If we haven't selected a word chunk this could be because of
* a) no words are left in the index

@@ -744,7 +744,7 @@ public final class plasmaHTCache {
URL url = null;
// try the switchboard's URL databases
try {
url = plasmaSwitchboard.getSwitchboard().urlPool.getURL(urlHash);
url = plasmaSwitchboard.getSwitchboard().getURL(urlHash);
} catch (Exception e) {
log.logWarning("getURL(" + urlHash + "): " /*+ e.getMessage()*/, e);
url = null;

@@ -134,6 +134,7 @@ import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
@@ -206,7 +207,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public File rankingPath;
public File workPath;
public HashMap rankingPermissions;
public plasmaURLPool urlPool;
public plasmaCrawlNURL noticeURL;
public plasmaCrawlEURL errorURL;
public plasmaWordIndex wordIndex;
public plasmaHTCache cacheManager;
public plasmaSnippetCache snippetCache;
@@ -366,10 +368,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// read memory amount
int ramLURL = (int) getConfigLong("ramCacheLURL", 1024) / 1024;
int ramLURL = (int) getConfigLong("ramCacheLURL", 1024);
long ramLURL_time = getConfigLong("ramCacheLURL_time", 1000);
ramLURL = Math.max((int) (serverMemory.available() / 2 / 1024), ramLURL);
setConfig("ramCacheLURL", ramLURL * 1024);
ramLURL = Math.max((int) (serverMemory.available() / 2), ramLURL);
setConfig("ramCacheLURL", ramLURL);
int ramNURL = (int) getConfigLong("ramCacheNURL", 1024) / 1024;
long ramNURL_time = getConfigLong("ramCacheNURL_time", 1000);
ramNURL = Math.max((int) (serverMemory.available() / 10 / 1024), ramNURL);
@@ -378,10 +380,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
long ramEURL_time = getConfigLong("ramCacheEURL_time", 1000);
ramEURL = Math.max((int) (serverMemory.available() / 20 / 1024), ramEURL);
setConfig("ramCacheEURL", ramEURL * 1024);
int ramRWI = (int) getConfigLong("ramCacheRWI", 1024) / 1024;
int ramRWI = (int) getConfigLong("ramCacheRWI", 1024);
long ramRWI_time = getConfigLong("ramCacheRWI_time", 1000);
ramRWI = Math.max((int) (serverMemory.available() / 4 / 1024), ramRWI);
setConfig("ramCacheRWI", ramRWI * 1024);
ramRWI = Math.max((int) (serverMemory.available() / 4), ramRWI);
setConfig("ramCacheRWI", ramRWI);
int ramHTTP = (int) getConfigLong("ramCacheHTTP", 1024) / 1024;
long ramHTTP_time = getConfigLong("ramCacheHTTP_time", 1000);
int ramMessage = (int) getConfigLong("ramCacheMessage", 1024) / 1024;
@@ -429,12 +431,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start indexing management
log.logConfig("Starting Indexing Management");
urlPool = new plasmaURLPool(plasmaPath, indexPath,
ramLURL,
ramNURL,
ramEURL,
ramLURL_time);
wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log);
wordIndex = new plasmaWordIndex(indexPath, ramRWI, ramLURL, ramRWI_time, log);
noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1);
errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1);
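// note: the loaded-URL db is no longer constructed here; plasmaWordIndex
// creates it internally and exposes it as wordIndex.loadedURL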
// set a high maximum cache size to current size; this is adapted later automatically
int wordCacheMaxCount = Math.max((int) getConfigLong("wordCacheInitCount", 30000),
@@ -471,7 +470,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* initialize switchboard queue
* ====================================================================== */
// create queue
this.sbQueue = new plasmaSwitchboardQueue(this.cacheManager, this.urlPool.loadedURL, new File(this.plasmaPath, "switchboardQueue1.stack"), this.profiles);
this.sbQueue = new plasmaSwitchboardQueue(this.cacheManager, this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue1.stack"), this.profiles);
// setting the indexing queue slots
indexingSlots = (int) getConfigLong("indexer.slots", 100);
@@ -727,6 +726,29 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public boolean isRobinsonMode() {
return (yacyCore.seedDB.sizeConnected() == 0) && (yacyCore.seedDB.mySeed.isVirgin());
}
public String urlExists(String hash) {
// tests if the hash occurs in any database;
// if it exists, the name of the database is returned,
// if it does not exist, null is returned
if (wordIndex.loadedURL.exists(hash)) return "loaded";
if (noticeURL.existsInStack(hash)) return "crawler";
if (errorURL.exists(hash)) return "errors";
return null;
}
public URL getURL(String urlhash) throws IOException {
if (urlhash.equals(plasmaURL.dummyHash)) return null;
try {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
} catch (IOException e) {}
indexURLEntry le = wordIndex.loadedURL.load(urlhash, null);
if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;
}
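// Usage sketch (hypothetical call site): these two helpers replace the
// corresponding methods of the removed plasmaURLPool.
String dbocc = sb.urlExists(hash); // "loaded", "crawler", "errors", or null
URL url = sb.getURL(hash);         // checks noticeURL, then loadedURL, then errorURL; may throw IOException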
/**
* This method changes the HTCache size.<br>
@@ -796,7 +818,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public boolean cleanProfiles() throws InterruptedException {
if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return false;
if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return false;
final Iterator iter = profiles.profiles(true);
plasmaCrawlProfile.entry entry;
boolean hasDoneSomething = false;
@@ -970,9 +992,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
sbQueue.close();
flushCitationReference(crg, "crg");
log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
int waitingBoundSeconds = Integer.parseInt(getConfig("maxWaitingWordFlush", "120"));
urlPool.close();
wordIndex.close(waitingBoundSeconds);
noticeURL.close();
errorURL.close();
wordIndex.close();
log.logConfig("SWITCHBOARD SHUTDOWN TERMINATED");
}
@@ -1017,7 +1039,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// flush some entries from the RAM cache
// (new permanent cache flushing)
wordIndex.flushCacheSome(sbQueue.size() != 0);
urlPool.loadedURL.flushCacheSome();
wordIndex.loadedURL.flushCacheSome();
boolean doneSomething = false;
@@ -1041,7 +1063,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
) {
// generate new chunk
int minChunkSize = (int) getConfigLong("indexDistribution.minChunkSize", 30);
dhtTransferChunk = new plasmaDHTChunk(this.log, this.wordIndex, this.urlPool.loadedURL, minChunkSize, dhtTransferIndexCount, 5000);
dhtTransferChunk = new plasmaDHTChunk(this.log, wordIndex, wordIndex.loadedURL, minChunkSize, dhtTransferIndexCount, 5000);
doneSomething = true;
}
@@ -1079,10 +1101,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do one processing step
log.logFine("DEQUEUE: sbQueueSize=" + sbQueue.size() +
", coreStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
", limitStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) +
", overhangStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) +
", remoteStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
", coreStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
", limitStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) +
", overhangStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) +
", remoteStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
try {
nextentry = sbQueue.pop();
if (nextentry == null) {
@@ -1112,9 +1134,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public int cleanupJobSize() {
int c = 0;
if ((urlPool.errorURL.stackSize() > 1000)) c++;
if ((errorURL.stackSize() > 1000)) c++;
for (int i = 1; i <= 6; i++) {
if (urlPool.loadedURL.getStackSize(i) > 1000) c++;
if (wordIndex.loadedURL.getStackSize(i) > 1000) c++;
}
return c;
}
@@ -1133,17 +1155,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// clean up error stack
checkInterruption();
if ((urlPool.errorURL.stackSize() > 1000)) {
log.logFine("Cleaning Error-URLs report stack, " + urlPool.errorURL.stackSize() + " entries on stack");
urlPool.errorURL.clearStack();
if ((errorURL.stackSize() > 1000)) {
log.logFine("Cleaning Error-URLs report stack, " + errorURL.stackSize() + " entries on stack");
errorURL.clearStack();
hasDoneSomething = true;
}
// clean up loadedURL stack
for (int i = 1; i <= 6; i++) {
checkInterruption();
if (urlPool.loadedURL.getStackSize(i) > 1000) {
log.logFine("Cleaning Loaded-URLs report stack, " + urlPool.loadedURL.getStackSize(i) + " entries on stack " + i);
urlPool.loadedURL.clearStack(i);
if (wordIndex.loadedURL.getStackSize(i) > 1000) {
log.logFine("Cleaning Loaded-URLs report stack, " + wordIndex.loadedURL.getStackSize(i) + " entries on stack " + i);
wordIndex.loadedURL.clearStack(i);
hasDoneSomething = true;
}
}
@@ -1209,11 +1231,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int coreCrawlJobSize() {
return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
}
public boolean coreCrawlJob() {
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
}
@@ -1247,10 +1269,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do a local crawl
plasmaCrawlNURL.Entry urlEntry = null;
while (urlEntry == null && urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@@ -1276,11 +1298,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int limitCrawlTriggerJobSize() {
return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
}
public boolean limitCrawlTriggerJob() {
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) {
if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) {
//log.logDebug("LimitCrawl: queue is empty");
return false;
}
@@ -1292,7 +1314,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (toshift > 1000) toshift = 1000;
if (toshift > limitCrawlTriggerJobSize()) toshift = limitCrawlTriggerJobSize();
for (int i = 0; i < toshift; i++) {
urlPool.noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
}
log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl");
}
@@ -1312,10 +1334,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// start a global crawl, if possible
String stats = "REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@@ -1327,7 +1349,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logFine("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter="
+ profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
boolean tryRemote = ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) &&
boolean tryRemote = ((noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) &&
(profile.remoteIndexing()) &&
(urlEntry.initiator() != null) &&
// (!(urlEntry.initiator().equals(indexURL.dummyHash))) &&
@@ -1359,7 +1381,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int remoteTriggeredCrawlJobSize() {
return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
}
public boolean remoteTriggeredCrawlJob() {
@@ -1367,7 +1389,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do nothing if either there are private processes to be done
// or there is no global crawl on the stack
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
//log.logDebug("GlobalCrawl: queue is empty");
return false;
}
@@ -1398,10 +1420,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
String stats = "REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
@@ -1531,7 +1553,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} catch (MalformedURLException e1) {}
}
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.normalizedURLString() +
", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
", NEW CRAWL STACK SIZE IS " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
}
stackEndTime = System.currentTimeMillis();
@@ -1568,7 +1590,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
// create a new loaded URL db entry
indexURLEntry newEntry = urlPool.loadedURL.newEntry(
indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
"", // author
@@ -1594,8 +1616,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* ========================================================================
* STORE URL TO LOADED-URL-DB
* ======================================================================== */
urlPool.loadedURL.store(newEntry);
urlPool.loadedURL.stack(
wordIndex.loadedURL.store(newEntry);
wordIndex.loadedURL.stack(
newEntry, // loaded url db entry
initiatorPeerHash, // initiator peer hash
yacyCore.seedDB.mySeed.hash, // executor peer hash
@@ -1672,7 +1694,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaCondenser.word2hash(word);
indexRWIEntry wordIdxEntry = wordIndex.newRWIEntry(
indexRWIEntry wordIdxEntry = new indexRWIEntryNew(
urlHash,
urlLength, urlComps,
wordStat.count,
@@ -1807,7 +1829,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// removing current entry from notice URL queue
boolean removed = urlPool.noticeURL.remove(entry.urlHash()); // worked-off
boolean removed = noticeURL.remove(entry.urlHash()); // worked-off
if (!removed) {
log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
}
@@ -1911,7 +1933,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL refererURL = null;
String refererHash = urlEntry.referrerHash();
if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try {
refererURL = this.urlPool.getURL(refererHash);
refererURL = this.getURL(refererHash);
} catch (IOException e) {
refererURL = null;
}
@@ -1924,7 +1946,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// return true iff another peer has indexed or will index the url
if (urlEntry == null) {
log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return true; // superfluous request; true correct in this context
}
@@ -1952,7 +1974,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do the request
try {
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()), 6000);
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerHash()), 6000);
// check success
/*
@@ -1990,10 +2012,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
indexURLEntry entry = urlPool.loadedURL.newEntry(propStr);
urlPool.loadedURL.store(entry);
urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** superfluous/duplicate?
urlPool.noticeURL.remove(entry.hash());
indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr);
wordIndex.loadedURL.store(entry);
wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** superfluous/duplicate?
noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
@@ -2051,7 +2073,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//}
// create a new search event
plasmaSearchEvent theSearch = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, postsort, log, wordIndex, urlPool.loadedURL, snippetCache);
plasmaSearchEvent theSearch = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, postsort, log, wordIndex, wordIndex.loadedURL, snippetCache);
plasmaSearchResult acc = theSearch.search();
// fetch snippets
@@ -2094,7 +2116,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8"));
urlPool.loadedURL.remove(urlentry.hash()); // clean up
wordIndex.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
urlname = "http://share." + seed.getName() + ".yacy" + filename;
@@ -2217,7 +2239,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
indexURLEntry entry = urlPool.loadedURL.load(urlhash, null);
indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null);
if (entry == null) return 0;
indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) return 0;
@@ -2245,7 +2267,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (witer != null) count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
wordIndex.loadedURL.remove(urlhash);
return count;
} catch (ParserException e) {
return 0;
@@ -2373,15 +2395,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (getConfig("allowDistributeIndex","false").equalsIgnoreCase("false")) {
return "no DHT distribution: not enabled";
}
if (urlPool.loadedURL.size() < 10) {
return "no DHT distribution: loadedURL.size() = " + urlPool.loadedURL.size();
if (wordIndex.loadedURL.size() < 10) {
return "no DHT distribution: loadedURL.size() = " + wordIndex.loadedURL.size();
}
if (wordIndex.size() < 100) {
return "no DHT distribution: not enough words - wordIndex.size() = " + wordIndex.size();
}
if ((getConfig("allowDistributeIndexWhileCrawling","false").equalsIgnoreCase("false")) &&
((urlPool.noticeURL.stackSize() > 0) || (sbQueue.size() > 3))) {
return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + urlPool.noticeURL.stackSize() + ", sbQueue.size() = " + sbQueue.size();
((noticeURL.stackSize() > 0) || (sbQueue.size() > 3))) {
return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + noticeURL.stackSize() + ", sbQueue.size() = " + sbQueue.size();
}
return null;
}
@@ -2522,7 +2544,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
kelondroBitfield flags
) {
// create a new errorURL DB entry
plasmaCrawlEURL.Entry ee = this.urlPool.errorURL.newEntry(
plasmaCrawlEURL.Entry ee = this.errorURL.newEntry(
url,
referrerHash,
initiator,
@@ -2534,7 +2556,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// store the entry
ee.store();
// push it onto the stack
this.urlPool.errorURL.stackPushEntry(ee);
this.errorURL.stackPushEntry(ee);
}
public void checkInterruption() throws InterruptedException {

@@ -1,99 +0,0 @@
// plasmaURLPool.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 16.06.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// this class combines all url storage methods into one. It is the host for all url storage
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
public class plasmaURLPool {
public final plasmaCrawlLURL loadedURL;
public final plasmaCrawlNURL noticeURL;
public final plasmaCrawlEURL errorURL;
public plasmaURLPool(File plasmaPath, File indexPath,
int ramLURL,
int ramNURL,
int ramEURL,
long preloadTime) {
loadedURL = new plasmaCrawlLURL(plasmaPath, indexPath, ramLURL, preloadTime);
noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1);
errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1);
}
public String exists(String hash) {
// tests if the hash occurs in any database;
// if it exists, the name of the database is returned,
// if it does not exist, null is returned
if (loadedURL.exists(hash)) return "loaded";
if (noticeURL.existsInStack(hash)) return "crawler";
if (errorURL.exists(hash)) return "errors";
return null;
}
public URL getURL(String urlhash) throws IOException {
if (urlhash.equals(plasmaURL.dummyHash)) return null;
try {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
} catch (IOException e) {}
indexURLEntry le = loadedURL.load(urlhash, null);
if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;
}
public void close() {
try {loadedURL.close();} catch (IOException e) {}
noticeURL.close();
try {errorURL.close();} catch (IOException e) {}
}
}
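// For orientation, the call-site mapping applied throughout this commit
// (sb is the plasmaSwitchboard):
//   sb.urlPool.loadedURL.*   ->  sb.wordIndex.loadedURL.*
//   sb.urlPool.noticeURL.*   ->  sb.noticeURL.*
//   sb.urlPool.errorURL.*    ->  sb.errorURL.*
//   sb.urlPool.exists(hash)  ->  sb.urlExists(hash)
//   sb.urlPool.getURL(hash)  ->  sb.getURL(hash)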

@@ -27,7 +27,6 @@
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@@ -47,11 +46,8 @@ import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.logging.serverLog;
@@ -59,67 +55,33 @@ import de.anomic.yacy.yacyDHTAction;
public final class plasmaWordIndex implements indexRI {
private static final kelondroRow payloadrowold = indexRWIEntryOld.urlEntryRow;
private static final kelondroRow payloadrownew = indexRWIEntryNew.urlEntryRow;
private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;
private final indexRAMRI dhtOutCache, dhtInCache;
private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
public boolean busyCacheFlush; // shows if a cache flush is currently performed
private int idleDivisor, busyDivisor;
public final plasmaCrawlLURL loadedURL;
private final File oldDatabaseRoot;
private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;
private final indexRAMRI dhtOutCache, dhtInCache;
private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
public boolean busyCacheFlush; // shows if a cache flush is currently performed
private int idleDivisor, busyDivisor;
public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log) {
this.oldDatabaseRoot = oldDatabaseRoot;
File textindexcache = new File(newIndexRoot, "PUBLIC/TEXT/RICACHE");
public plasmaWordIndex(File indexRoot, long rwibuffer, long lurlbuffer, long preloadTime, serverLog log) {
File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
this.dhtOutCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump1.array", log, true);
this.dhtInCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump2.array", log, true);
this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntryNew.urlEntryRow, 2040, "dump1.array", log);
this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntryNew.urlEntryRow, 2040, "dump2.array", log);
// create collections storage path
File textindexcollections = new File(newIndexRoot, "PUBLIC/TEXT/RICOLLECTION");
File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new indexCollectionRI(textindexcollections, "collection", bufferkb * 1024, preloadTime, payloadrownew);
this.collections = new indexCollectionRI(textindexcollections, "collection", rwibuffer, preloadTime, indexRWIEntryNew.urlEntryRow);
// create LURL-db
loadedURL = new plasmaCrawlLURL(indexRoot, lurlbuffer, preloadTime);
// performance settings
busyCacheFlush = false;
this.busyDivisor = 5000;
this.idleDivisor = 420;
}
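// Construction sketch matching the new switchboard wiring (buffer figures
// are placeholders); the word index now also owns the loaded-URL db:
plasmaWordIndex wordIndex = new plasmaWordIndex(indexPath, ramRWI, ramLURL, ramRWI_time, log);
plasmaCrawlLURL loadedURL = wordIndex.loadedURL;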
public kelondroRow payloadrow() {
return payloadrownew;
}
public indexRWIEntry newRWIEntry(
String urlHash,
int urlLength,
int urlComps,
int titleLength,
int hitcount,
int wordcount,
int phrasecount,
int posintext,
int posinphrase,
int posofphrase,
int worddistance,
int sizeOfPage,
long lastmodified,
long updatetime,
int quality,
String language,
char doctype,
int outlinksSame,
int outlinksOther,
kelondroBitfield flags ) {
return new indexRWIEntryNew(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype,
outlinksSame, outlinksOther, flags);
}
public File getRoot() {
return oldDatabaseRoot;
}
public int maxURLinDHTOutCache() {
return dhtOutCache.maxURLinCache();
}
@@ -184,12 +146,12 @@ public final class plasmaWordIndex implements indexRI {
}
public indexContainer emptyContainer(String wordHash) {
return new indexContainer(wordHash, payloadrow(), true);
return new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
}
public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
public void addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
if (entry instanceof indexRWIEntryOld) {
if (entry.urlHash() == null) return null;
if (entry.urlHash() == null) return;
entry = new indexRWIEntryNew((indexRWIEntryOld) entry);
}
@@ -203,12 +165,12 @@ public final class plasmaWordIndex implements indexRI {
dhtOutCache.addEntry(wordHash, entry, updateTime, false);
flushControl();
}
return null;
}
/*
private indexContainer convertOld2New(indexContainer entries) {
// convert old entries to new entries
indexContainer newentries = new indexContainer(entries.getWordHash(), payloadrownew, true);
indexContainer newentries = new indexContainer(entries.getWordHash(), indexRWIEntryNew.urlEntryRow);
Iterator i = entries.entries();
indexRWIEntryOld old;
while (i.hasNext()) {
@@ -219,9 +181,9 @@ public final class plasmaWordIndex implements indexRI {
}
return newentries;
}
public indexContainer addEntries(indexContainer entries, long updateTime, boolean dhtInCase) {
if (entries.row().objectsize() == payloadrowold.objectsize()) entries = convertOld2New(entries);
*/
public void addEntries(indexContainer entries, long updateTime, boolean dhtInCase) {
assert (entries.row().objectsize() == indexRWIEntryNew.urlEntryRow.objectsize());
// set dhtInCase depending on wordHash
if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(entries.getWordHash()))) dhtInCase = true;
@@ -233,7 +195,6 @@ public final class plasmaWordIndex implements indexRI {
dhtOutCache.addEntries(entries, updateTime, false);
flushControl();
}
return null;
}
public void flushCacheSome(boolean busy) {
@@ -263,12 +224,7 @@ public final class plasmaWordIndex implements indexRI {
// flush the wordHash
indexContainer c = ram.deleteContainer(wordHash);
if (c != null) {
indexContainer feedback = collections.addEntries(c, c.updated(), false);
if (feedback != null) {
throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString());
}
}
if (c != null) collections.addEntries(c, c.updated(), false);
// pause to next loop to give other processes a chance to use IO
//try {this.wait(8);} catch (InterruptedException e) {}
@@ -330,7 +286,7 @@ public final class plasmaWordIndex implements indexRI {
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaCondenser.word2hash(word);
ientry = newRWIEntry(urlHash,
ientry = new indexRWIEntryNew(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count,
condenser.RESULT_SIMI_WORDS,
@@ -415,16 +371,16 @@ public final class plasmaWordIndex implements indexRI {
return size;
}
public void close(int waitingBoundSeconds) {
public void close() {
synchronized (this) {
dhtInCache.close(waitingBoundSeconds);
dhtOutCache.close(waitingBoundSeconds);
collections.close(-1);
dhtInCache.close();
dhtOutCache.close();
collections.close();
}
}
public indexContainer deleteContainer(String wordHash) {
indexContainer c = new indexContainer(wordHash, payloadrow(), true);
indexContainer c = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
c.add(dhtInCache.deleteContainer(wordHash), -1);
c.add(dhtOutCache.deleteContainer(wordHash), -1);
c.add(collections.deleteContainer(wordHash), -1);
@@ -456,9 +412,7 @@ public final class plasmaWordIndex implements indexRI {
}
public static final int RL_RAMCACHE = 0;
public static final int RL_COLLECTIONS = 1; // the new index structure
public static final int RL_ASSORTMENTS = 2; // (to be) outdated structure
public static final int RL_WORDFILES = 3; // (to be) outdated structure
public static final int RL_COLLECTIONS = 1;
public int tryRemoveURLs(String urlHash) {
// this tries to delete an index from the cache that has this
@@ -468,14 +422,14 @@ public final class plasmaWordIndex implements indexRI {
return dhtInCache.tryRemoveURLs(urlHash) | dhtOutCache.tryRemoveURLs(urlHash);
}
public TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) {
public TreeSet indexContainerSet(String startHash, boolean ram, boolean rot, int count) {
// creates a set of indexContainers
// this does not use the dhtInCache
kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone());
containerOrder.rotate(startHash.getBytes());
TreeSet containers = new TreeSet(containerOrder);
Iterator i = wordContainers(startHash, resourceLevel, rot);
if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) count = Math.min(dhtOutCache.size(), count);
Iterator i = wordContainers(startHash, ram, rot);
if (ram) count = Math.min(dhtOutCache.size(), count);
indexContainer container;
while ((count > 0) && (i.hasNext())) {
container = (indexContainer) i.next();
@@ -486,38 +440,35 @@ public final class plasmaWordIndex implements indexRI {
}
return containers;
}
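// Usage sketch: fetch up to 100 containers from the file-backed index
// (ram == false), starting at a given word hash, without rotation:
TreeSet containers = wordIndex.indexContainerSet(startHash, false, false, 100);
Iterator indexContainerIterator = containers.iterator();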
public Iterator wordContainers(String startHash, boolean rot) {
// returns an iteration of indexContainers
return wordContainers(startHash, RL_WORDFILES, rot);
}
public Iterator wordContainers(String startHash, int resourceLevel, boolean rot) {
if (rot) return new rotatingContainerIterator(startHash, resourceLevel);
else return wordContainers(startHash, resourceLevel);
public Iterator wordContainers(String startHash, boolean ram, boolean rot) {
if (rot) return new rotatingContainerIterator(startHash, ram);
else return wordContainers(startHash, ram);
}
private Iterator wordContainers(String startWordHash, int resourceLevel) {
public Iterator wordContainers(String startWordHash, boolean ram) {
kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone());
containerOrder.rotate(startWordHash.getBytes());
if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) {
if (ram) {
return dhtOutCache.wordContainers(startWordHash, false);
}
return new kelondroMergeIterator(
} else {
return new kelondroMergeIterator(
dhtOutCache.wordContainers(startWordHash, false),
collections.wordContainers(startWordHash, false),
containerOrder,
indexContainer.containerMergeMethod,
true);
}
}
public class rotatingContainerIterator implements Iterator {
Iterator i;
int resourceLevel;
boolean ram;
public rotatingContainerIterator(String startWordHash, int resourceLevel) {
this.resourceLevel = resourceLevel;
i = wordContainers(startWordHash, resourceLevel);
public rotatingContainerIterator(String startWordHash, boolean ram) {
this.ram = ram;
i = wordContainers(startWordHash, ram);
}
public void finalize() {
@@ -527,7 +478,7 @@ public final class plasmaWordIndex implements indexRI {
public boolean hasNext() {
if (i.hasNext()) return true;
else {
i = wordContainers("------------", resourceLevel);
i = wordContainers("------------", ram);
return i.hasNext();
}
}
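// note: "------------" is assumed to be the smallest 12-character word hash in
// the index order, so re-seeding the iterator there wraps the rotation back to
// the start of the key space.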
@ -541,44 +492,6 @@ public final class plasmaWordIndex implements indexRI {
}
} // class rotatingContainerIterator
public Object migrateWords2index(String wordhash) throws IOException {
// returns the number of entries that had been added to the assortments
// can be negative if some assortments have been moved to the backend
File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash);
if (!(db.exists())) return "not available";
plasmaWordIndexFile entity = null;
try {
entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
int size = entity.size();
indexContainer container = new indexContainer(wordhash, payloadrow(), true);
try {
Iterator entries = entity.elements(true);
indexRWIEntry entry;
while (entries.hasNext()) {
entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
entity.close();
entity = null;
indexContainer feedback = collections.addEntries(container, container.updated(), false);
if (feedback != null) return feedback;
return new Integer(size);
} catch (kelondroException e) {
// database corrupted, we simply give up on the database and delete it
try { entity.close(); } catch (Exception ee) { }
entity = null;
try { db.delete(); } catch (Exception ee) { }
return "database corrupted; deleted";
}
} finally {
if (entity != null) try {entity.close();}catch(Exception e){}
}
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
// see http://www.yacy-forum.de/viewtopic.php?p=18093#18093
@ -609,7 +522,7 @@ public final class plasmaWordIndex implements indexRI {
indexRWIEntry entry = null;
URL url = null;
HashSet urlHashs = new HashSet();
Iterator indexContainerIterator = indexContainerSet(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
Iterator indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator();
while (indexContainerIterator.hasNext() && run) {
waiter();
container = (indexContainer) indexContainerIterator.next();
@ -639,7 +552,7 @@ public final class plasmaWordIndex implements indexRI {
}
if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
TreeSet containers = indexContainerSet(container.getWordHash(), plasmaWordIndex.RL_WORDFILES, false, 100);
TreeSet containers = indexContainerSet(container.getWordHash(), false, false, 100);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) {
@ -693,13 +606,14 @@ public final class plasmaWordIndex implements indexRI {
public static void main(String[] args) {
// System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
// System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));
File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB");
/*
File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX");
plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, true, 555, 1000, new serverLog("TESTAPP"));
plasmaWordIndex index = new plasmaWordIndex(indexdb, true, 555, 1000, new serverLog("TESTAPP"));
Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true);
while (containerIter.hasNext()) {
System.out.println("File: " + (indexContainer) containerIter.next());
}
*/
}
}

@ -58,6 +58,7 @@ import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroColumn;
@ -74,12 +75,9 @@ public final class plasmaWordIndexAssortment {
// class variables
private File assortmentFile;
private int assortmentLength;
private serverLog log;
private kelondroCache assortments;
private long bufferSize;
private long preloadTime;
private kelondroRow payloadrow;
private static String intx(int x) {
String s = Integer.toString(x);
@ -92,23 +90,20 @@ public final class plasmaWordIndexAssortment {
structure[0] = new kelondroColumn("byte[] wordhash-" + yacySeedDB.commonHashLength);
structure[1] = new kelondroColumn("Cardinal occ-4 {b256}");
structure[2] = new kelondroColumn("Cardinal time-8 {b256}");
kelondroColumn p = new kelondroColumn("byte[] urlprops-" + payloadrow.objectsize());
kelondroColumn p = new kelondroColumn("byte[] urlprops-" + indexRWIEntryOld.urlEntryRow.objectsize());
for (int i = 0; i < assortmentCapacity; i++) structure[3 + i] = p;
return new kelondroRow(structure);
}
private int assortmentCapacity(int rowsize) {
return (rowsize - yacySeedDB.commonHashLength - 12) / payloadrow.objectsize();
return (rowsize - yacySeedDB.commonHashLength - 12) / indexRWIEntryOld.urlEntryRow.objectsize();
}
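// worked example (hypothetical payload size): with the 12-byte word hash, the
// 4-byte occurrence counter and the 8-byte time stamp, an assortment of
// capacity 3 and a 44-byte payload row gives rowsize = 12 + 12 + 3 * 44 = 156;
// the inverse computation above then yields (156 - 12 - 12) / 44 = 3 again.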
public plasmaWordIndexAssortment(File storagePath, kelondroRow payloadrow, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException {
public plasmaWordIndexAssortment(File storagePath, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException {
if (!(storagePath.exists())) storagePath.mkdirs();
this.payloadrow = payloadrow;
this.assortmentFile = new File(storagePath, assortmentFileName + intx(assortmentLength) + ".db");
this.assortmentLength = assortmentLength;
//this.bufferStructureLength = 3 + 2 * assortmentLength;
this.bufferSize = bufferkb * 1024;
this.preloadTime = preloadTime;
this.log = log;
// open assortment tree file
long start = System.currentTimeMillis();
@ -122,138 +117,26 @@ public final class plasmaWordIndexAssortment {
assortments.cacheNodeStatus()[1] + " preloaded");
}
public void store(indexContainer newContainer) throws IOException {
// stores a word index to the assortment database
// this throws an exception if the word hash already exists
//log.logDebug("storeAssortment: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime);
if (newContainer.size() != assortmentLength) throw new RuntimeException("plasmaWordIndexAssortment.store: wrong container size");
kelondroRow.Entry row = assortments.row().newEntry();
row.setCol(0, newContainer.getWordHash().getBytes());
row.setCol(1, 1);
row.setCol(2, newContainer.updated());
Iterator entries = newContainer.entries();
indexRWIEntry entry;
for (int i = 0; i < assortmentLength; i++) {
entry = (indexRWIEntry) entries.next();
row.setCol(3 + i, entry.toKelondroEntry().bytes());
}
kelondroRow.Entry oldrow = null;
try {
oldrow = assortments.put(row);
} catch (IOException e) {
e.printStackTrace();
log.logSevere("storeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
} catch (IndexOutOfBoundsException e) {
e.printStackTrace();
log.logSevere("storeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
} catch (kelondroException e) {
e.printStackTrace();
log.logSevere("storeAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
}
if (oldrow != null) throw new RuntimeException("Store to assortment ambiguous");
}
public indexContainer remove(String wordHash) {
// deletes a word index from the assortment database
// and returns the content record
kelondroRow.Entry row = null;
try {
row = assortments.remove(wordHash.getBytes());
} catch (IOException e) {
log.logSevere("removeAssortment/IO-error: " + e.getMessage()
+ " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
return null;
} catch (kelondroException e) {
log.logSevere("removeAssortment/kelondro-error: " + e.getMessage()
+ " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
return null;
}
return row2container(row);
}
public boolean contains(String wordHash) {
// checks whether a word index exists in the assortment database
// for the given word hash
kelondroRow.Entry row = null;
try {
row = assortments.get(wordHash.getBytes());
return (row != null);
} catch (IOException e) {
return false;
} catch (kelondroException e) {
log.logSevere("removeAssortment/kelondro-error: " + e.getMessage()
+ " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
return false;
}
}
public indexContainer get(String wordHash) {
// gets a word index from the assortment database
// and returns the content record
kelondroRow.Entry row = null;
try {
row = assortments.get(wordHash.getBytes());
} catch (IOException e) {
log.logSevere("removeAssortment/IO-error: " + e.getMessage()
+ " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
return null;
} catch (kelondroException e) {
log.logSevere("removeAssortment/kelondro-error: " + e.getMessage()
+ " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
return null;
}
return row2container(row);
}
public final indexContainer row2container(kelondroRow.Entry row) {
if (row == null) return null;
String wordHash = row.getColString(0, null);
final long updateTime = row.getColLong(2);
indexContainer container = new indexContainer(wordHash, payloadrow, false);
indexContainer container = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
int al = assortmentCapacity(row.objectsize());
for (int i = 0; i < al; i++) {
container.add(new indexRWIEntry[] { new indexRWIEntryOld(row.getColBytes(3 + i)) }, updateTime);
// fill AND convert old entries to new entries
container.add(new indexRWIEntry[] { new indexRWIEntryNew(new indexRWIEntryOld(row.getColBytes(3 + i))) }, updateTime);
}
return container;
}
private void resetDatabase() {
// deletes the assortment database and creates a new one
if (assortments != null) try {
assortments.close();
} catch (IOException e) {}
try {
// make a back-up
File backupPath = new File(assortmentFile.getParentFile(), "ABKP");
if (!(backupPath.exists())) backupPath.mkdirs();
File backupFile = new File(backupPath, assortmentFile.getName() + System.currentTimeMillis());
assortmentFile.renameTo(backupFile);
log.logInfo("a back-up of the deleted assortment file is in " + backupFile.toString());
if (assortmentFile.exists()) assortmentFile.delete();
assortments = new kelondroCache(kelondroTree.open(assortmentFile, bufferSize / 2, preloadTime, bufferStructure(assortmentLength)), bufferSize / 2, true, false);
} catch (Exception e) {
// if this fails, delete the file
if (!(assortmentFile.delete())) throw new RuntimeException("cannot delete assortment database");
}
}
public Iterator containers(String startWordHash, boolean up, boolean rot) throws IOException {
public Iterator wordContainers(String startWordHash, boolean up, boolean rot) throws IOException {
// returns an iteration of indexContainer elements
try {
return new containerIterator(startWordHash, up, rot);
} catch (kelondroException e) {
log.logSevere("iterateAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB " + assortmentFile, e);
resetDatabase();
return null;
}
}
@ -288,22 +171,6 @@ public final class plasmaWordIndexAssortment {
return 0;
}
}
public int cacheNodeChunkSize() {
return assortments.cacheNodeChunkSize();
}
public int cacheObjectChunkSize() {
return assortments.cacheObjectChunkSize();
}
public int[] cacheNodeStatus() {
return assortments.cacheNodeStatus();
}
public long[] cacheObjectStatus() {
return assortments.cacheObjectStatus();
}
public void close() {
try {

@ -1,408 +0,0 @@
// plasmaWordIndexAssortmentCluster.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 20.5.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
/*
An assortment-cluster is a set of assortments.
Each one carries a different number of URLs per word.
*/
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import de.anomic.index.indexContainer;
import de.anomic.index.indexContainerOrder;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRI;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroRow;
import de.anomic.server.logging.serverLog;
public final class plasmaWordIndexAssortmentCluster implements indexRI {
// class variables
private int clusterCount; // number of cluster files
public int clusterCapacity; // number of all URL references that can be stored for a single word in the cluster
//private serverLog log;
private plasmaWordIndexAssortment[] assortments;
private long completeBufferKB;
private kelondroRow payloadrow;
public plasmaWordIndexAssortmentCluster(File assortmentsPath, int clusterCount, kelondroRow payloadrow, int bufferkb, long preloadTime, serverLog log) throws IOException {
// set class variables
if (!(assortmentsPath.exists())) assortmentsPath.mkdirs();
this.payloadrow = payloadrow;
this.clusterCount = clusterCount;
this.clusterCapacity = clusterCount * (clusterCount + 1) / 2;
this.completeBufferKB = bufferkb;
// this.log = log;
this.assortments = new plasmaWordIndexAssortment[clusterCount];
// open cluster and close it directly again to detect the element sizes
int[] sizes = new int[clusterCount];
int sumSizes = 1;
plasmaWordIndexAssortment testAssortment;
for (int i = 0; i < clusterCount; i++) {
testAssortment = new plasmaWordIndexAssortment(assortmentsPath, payloadrow, i + 1, 0, 0, null);
sizes[i] = testAssortment.size() + clusterCount - i;
sumSizes += sizes[i];
testAssortment.close();
testAssortment = null;
}
// initialize cluster using the cluster element sizes for an optimal buffer size
long nextTime;
long startTime;
long sS = (long) sumSizes;
for (int i = 0; i < clusterCount; i++) {
nextTime = Math.max(0, preloadTime * ((long) sizes[i]) / sS);
startTime = System.currentTimeMillis();
assortments[i] = new plasmaWordIndexAssortment(
assortmentsPath,
payloadrow,
i + 1,
(int) (completeBufferKB * (long) sizes[i] / (long) sumSizes),
nextTime,
log);
preloadTime -= System.currentTimeMillis() - startTime;
sS -= sizes[i];
}
}
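// buffer split example (hypothetical sizes): for clusterCount == 2 with stored
// sizes 10 and 4, sizes[0] = 10 + 2 = 12 and sizes[1] = 4 + 1 = 5; with
// sumSizes = 1 + 12 + 5 = 18 the first assortment gets 12/18 and the second
// 5/18 of completeBufferKB, and preloadTime is spread in roughly the same
// proportion.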
private indexContainer storeSingular(indexContainer newContainer) throws IOException {
// this tries to store the record. If the record does not fit, or the same hash already
// exists and would not fit together with the new record, then the record is deleted from
// the assortment(s) and returned together with the new record.
// if storage was successful, NULL is returned.
if (newContainer.size() > clusterCount) return newContainer; // it will not fit
indexContainer buffer;
while ((buffer = assortments[newContainer.size() - 1].remove(newContainer.getWordHash())) != null) {
if (newContainer.add(buffer, -1) == 0) return newContainer; // security check; otherwise this loop does not terminate
if (newContainer.size() > clusterCount) return newContainer; // it will not fit
}
// the assortment (newContainer.size() - 1) should now be empty. put it in there
assortments[newContainer.size() - 1].store(newContainer);
// return null to show that we have stored the new Record successfully
return null;
}
private void storeForced(indexContainer newContainer) throws IOException {
// this stores the record and overwrites an existing record.
// this is safe if we can be sure that the record does not already exist.
if ((newContainer == null) || (newContainer.size() == 0) || (newContainer.size() > clusterCount)) return; // it will not fit
assortments[newContainer.size() - 1].store(newContainer);
}
private void storeStretched(indexContainer newContainer) throws IOException {
// this stores the record and stretches the storage over
// all the assortments that are necessary to fit in the record
// IMPORTANT: it must be ensured that the wordHash does not exist in the cluster before
// i.e. by calling removeFromAll
if (newContainer.size() <= clusterCount) {
storeForced(newContainer);
return;
}
// calculate minimum cluster insert point
int clusterMinStart = clusterCount;
int cap = clusterCapacity - newContainer.size() - 2 * clusterCount;
while (cap > 0) {
cap -= clusterMinStart;
clusterMinStart--;
}
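// worked trace (hypothetical sizes): for clusterCount == 8 the capacity is
// 8 * 9 / 2 = 36; a container of 10 entries gives cap = 36 - 10 - 16 = 10,
// and the loop lowers clusterMinStart from 8 to 6 (10 - 8 = 2, then 2 - 7 < 0),
// leaving the insert start to be picked between 6 and 8 below.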
// point the real cluster insert point somewhere between the minimum and the maximum
int clusterStart = clusterCount - (int) (Math.random() * (clusterCount - clusterMinStart));
// do the insert
indexContainer c;
Iterator i = newContainer.entries();
for (int j = clusterStart; j >= 1; j--) {
c = new indexContainer(newContainer.getWordHash(), payloadrow, false);
for (int k = 0; k < j; k++) {
if (i.hasNext()) {
c.add((indexRWIEntry) i.next(), newContainer.updated());
} else {
storeForced(c);
return;
}
}
storeForced(c);
}
}
public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, payloadrow, false);
container.add(newEntry);
return addEntries(container, updateTime, dhtCase);
}
public long getUpdateTime(String wordHash) {
indexContainer entries = getContainer(wordHash, null, false, -1);
if (entries == null) return 0;
return entries.updated();
}
public indexContainer addEntries(indexContainer newContainer, long creationTime, boolean dhtCase) {
// this is called by the index ram cache flush process
// it returns NULL if the storage was successful
// it returns a new container if the given container cannot be stored
// containers that are returned will be stored in a WORDS file
if (newContainer == null) return null;
if (newContainer.size() > clusterCapacity) return newContainer; // it will not fit
// split the container into several smaller containers that will take the whole thing
// first find out how the container can be split
int testsize = Math.min(clusterCount, newContainer.size());
int [] spaces = new int[testsize];
for (int i = testsize - 1; i >= 0; i--) spaces[i] = 0;
int need = newContainer.size();
int selectedAssortment = testsize - 1;
while (selectedAssortment >= 0) {
if (selectedAssortment + 1 <= need) {
spaces[selectedAssortment] = (assortments[selectedAssortment].get(newContainer.getWordHash()) == null) ? (selectedAssortment + 1) : 0;
need -= spaces[selectedAssortment];
assert (need >= 0);
if (need == 0) break;
}
selectedAssortment--;
}
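// split example (assuming clusterCount >= 7): a container with 7 entries and a
// free assortment 7 yields spaces[6] = 7 and need == 0 in one step; if that
// assortment already holds this word hash, the loop instead picks
// spaces[5] = 6 and spaces[0] = 1, storing the entries as a 6-record and a
// 1-record.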
if (need == 0) {
// we found spaces so that we can put the newContainer into these spaces
indexContainer c;
Iterator i = newContainer.entries();
for (int j = testsize - 1; j >= 0; j--) {
if (spaces[j] == 0) continue;
c = new indexContainer(newContainer.getWordHash(), payloadrow, false);
for (int k = 0; k <= j; k++) {
assert (i.hasNext());
c.add((indexRWIEntry) i.next(), newContainer.updated());
}
try {
storeForced(c);
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
if (newContainer.size() <= clusterCount) try {
newContainer = storeSingular(newContainer);
} catch (IOException e) {
e.printStackTrace();
}
if (newContainer == null) return null;
// clean up the whole thing, then try to insert the container
newContainer.add(deleteContainer(newContainer.getWordHash(), -1), -1);
if (newContainer.size() > clusterCapacity) return newContainer;
try {
storeStretched(newContainer);
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
public indexContainer deleteContainer(String wordHash) {
return deleteContainer(wordHash, -1);
}
public indexContainer deleteContainer(String wordHash, long maxTime) {
// removes all records from all the assortments and returns them
indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
long remainingTime;
for (int i = 0; i < clusterCount; i++) {
buffer = assortments[i].remove(wordHash);
remainingTime = limitTime - System.currentTimeMillis();
if (0 > remainingTime) break;
if (buffer != null) record.add(buffer, remainingTime);
}
return record;
}
/*
public int removeEntries(String wordHash, String[] referenceHashes, boolean deleteComplete) {
indexContainer c = deleteContainer(wordHash, -1);
int b = c.size();
c.removeEntries(wordHash, referenceHashes, false);
if (c.size() != 0) {
addEntries(c, c.updated(), false);
}
return b - c.size();
}
*/
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
boolean found = false;
for (int i = 0; i < clusterCount; i++) {
buffer = assortments[i].remove(wordHash);
if ((buffer != null) && (buffer.remove(urlHash) != null)) found = true;
record.add(buffer, -1);
if (found) break;
}
// put back remaining
if (record.size() != 0) {
addEntries(record, record.updated(), false);
}
return found;
}
public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) {
indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
int initialSize = urlHashes.size();
for (int i = 0; i < clusterCount; i++) {
buffer = assortments[i].remove(wordHash);
if (buffer != null) {
// sort out url hashes that shall be deleted
Iterator bi = buffer.entries();
indexRWIEntry entry;
while (bi.hasNext()) {
entry = (indexRWIEntry) bi.next();
if (urlHashes.remove(entry.urlHash())) bi.remove();
}
record.add(buffer, -1);
}
if (urlHashes.size() == 0) break;
}
// put back remaining
if (record.size() != 0) {
addEntries(record, record.updated(), false);
}
return initialSize - urlHashes.size();
}
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
// collect all records from all the assortments and return them
indexContainer buffer, record = new indexContainer(wordHash, payloadrow, false);
long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
buffer = assortments[i].get(wordHash);
if (buffer != null) {
buffer.select(urlselection);
record.add(buffer, -1);
}
if (System.currentTimeMillis() > timeout) break;
}
return record;
}
public int indexSize(String wordHash) {
int size = 0;
for (int i = 0; i < clusterCount; i++) {
if (assortments[i].contains(wordHash)) size += i + 1;
}
return size;
}
public Iterator wordContainers(String startWordHash, boolean rot) {
try {
return wordContainers(startWordHash, true, rot);
} catch (IOException e) {
return new HashSet().iterator();
}
}
public Iterator wordContainers(String startWordHash, boolean up, boolean rot) throws IOException {
// iterates indexContainer objects
HashSet containerIterators = new HashSet();
for (int i = 0; i < clusterCount; i++) containerIterators.add(assortments[i].containers(startWordHash, up, rot));
return kelondroMergeIterator.cascade(containerIterators, new indexContainerOrder(kelondroNaturalOrder.naturalOrder), indexContainer.containerMergeMethod, up);
}
public int size() {
int total = 0;
for (int i = 0; i < clusterCount; i++) total += assortments[i].size();
return total;
}
public int[] sizes() {
int[] sizes = new int[clusterCount];
for (int i = 0; i < clusterCount; i++) sizes[i] = assortments[i].size();
return sizes;
}
public int cacheChunkSizeAvg() {
int i = 0;
for (int j = 0; j < clusterCount; j++) {
i += assortments[j].cacheNodeChunkSize();
}
return i / clusterCount;
}
public int cacheObjectSizeAvg() {
long c = 0, k = 0;
for (int j = 0; j < clusterCount; j++) {
c += assortments[j].size() * assortments[j].cacheObjectChunkSize();
k += assortments[j].size();
}
return (k > 0) ? (int) (c / k) : 0;
}
public int[] cacheNodeStatus() {
int[][] a = new int[assortments.length][];
for (int i = assortments.length - 1; i >= 0; i--) a[i] = assortments[i].cacheNodeStatus();
return kelondroRecords.cacheCombinedStatus(a, assortments.length);
}
public long[] cacheObjectStatus() {
long[][] a = new long[assortments.length][];
for (int i = assortments.length - 1; i >= 0; i--) a[i] = assortments[i].cacheObjectStatus();
return kelondroCache.combinedStatus(a, a.length);
}
public void close(int waitingSeconds) {
for (int i = 0; i < clusterCount; i++) assortments[i].close();
}
}

@ -50,6 +50,7 @@ import java.util.Iterator;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
@ -131,7 +132,7 @@ public final class plasmaWordIndexFile {
public indexRWIEntry getEntry(String urlhash) throws IOException {
kelondroRow.Entry n = theIndex.get(urlhash.getBytes());
if (n == null) return null;
return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null));
return new indexRWIEntryNew(new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null)));
}
public boolean contains(String urlhash) throws IOException {
@ -142,33 +143,12 @@ public final class plasmaWordIndexFile {
return (theIndex.get(entry.urlHash().getBytes()) != null);
}
public boolean addEntry(indexRWIEntry entry) throws IOException {
if (entry == null) return false;
indexRWIEntry oldEntry = getEntry(entry.urlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
return false;
}
return (theIndex.put(entry.toKelondroEntry()) == null);
public void addEntry(indexRWIEntry entry) {
throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
public int addEntries(indexContainer container) throws IOException {
//System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
// fetch the index cache
if ((container == null) || (container.size() == 0)) return 0;
// open file
int count = 0;
// write from vector
if (container != null) {
Iterator i = container.entries();
while (i.hasNext()) {
if (addEntry((indexRWIEntry) i.next())) count++;
}
}
// close and return
return count;
public void addEntries(indexContainer container) {
throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
public boolean deleteComplete() {
@ -228,7 +208,7 @@ public final class plasmaWordIndexFile {
public Object next() {
if (i == null) return null;
kelondroRow.Entry n = (kelondroRow.Entry) i.next();
return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null));
return new indexRWIEntryNew(new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null)));
}
public void remove() {
throw new UnsupportedOperationException();
@ -239,8 +219,7 @@ public final class plasmaWordIndexFile {
return "DB:" + theIndex.toString();
}
public void merge(plasmaWordIndexFile otherEntity, long time) throws IOException {
public void merge(plasmaWordIndexFile otherEntity, long time) {
// this is a merge of another entity to this entity
// the merge is interrupted when the given time is over
// a time=-1 means: no timeout
@ -255,174 +234,4 @@ public final class plasmaWordIndexFile {
}
}
/*
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
// big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big.
// this will result in an OR behavior of the search instead of an AND behavior
long stamp = System.currentTimeMillis();
// order entities by their size
TreeMap map = new TreeMap();
plasmaWordIndexEntity singleEntity;
Iterator i = entities.iterator();
int count = 0;
while (i.hasNext()) {
// get next entity:
singleEntity = (plasmaWordIndexEntity) i.next();
// check result
if ((singleEntity == null) || (singleEntity.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a conjunction of searches, we have no result if any word is not known
// store result in order of result size
map.put(new Long(singleEntity.size() * 1000 + count), singleEntity);
count++;
}
// check if there is any result
if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found
// the map now holds the search results in order of number of hits per word
// we now must pairwise build up a conjunction of these sets
Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries
plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k);
while ((map.size() > 0) && (searchResult.size() > 0)) {
// take the first element of map which is a result and combine it with result
k = (Long) map.firstKey(); // the next smallest...
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
searchA = searchResult;
searchB = (plasmaWordIndexEntity) map.remove(k);
searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1));
// close the input files/structures
if (searchA != searchResult) searchA.close();
if (searchB != searchResult) searchB.close();
}
searchA = null; // free resources
searchB = null; // free resources
// in 'searchResult' is now the combined search result
if (searchResult.size() == 0) return new plasmaWordIndexEntity(null);
return searchResult;
}
public static plasmaWordIndexEntity joinConstructive(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException {
if ((i1 == null) || (i2 == null)) return null;
if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntity(null);
// decide which method to use
int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
int low = ((i1.size() > i2.size()) ? i2.size() : i1.size());
int stepsEnum = 10 * (high + low - 1);
int stepsTest = 12 * log2(high) * low;
// start most efficient method
if (stepsEnum > stepsTest) {
if (i1.size() < i2.size())
return joinConstructiveByTest(i1, i2, time);
else
return joinConstructiveByTest(i2, i1, time);
} else {
return joinConstructiveByEnumeration(i1, i2, time);
}
}
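// cost model example: for i1.size() == 100 and i2.size() == 10000, log2(10000)
// counts 14 bits, so stepsTest = 12 * 14 * 100 = 16800 beats
// stepsEnum = 10 * (10000 + 100 - 1) = 100990 and the test-based join is used;
// for two entities of 1000 entries each, stepsEnum = 10 * 1999 = 19990 beats
// stepsTest = 12 * 10 * 1000 = 120000 and enumeration is used.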
private static plasmaWordIndexEntity joinConstructiveByTest(plasmaWordIndexEntity small, plasmaWordIndexEntity large, long time) throws IOException {
System.out.println("DEBUG: JOIN METHOD BY TEST");
plasmaWordIndexEntity conj = new plasmaWordIndexEntity(null); // start with empty search result
Iterator se = small.elements(true);
plasmaWordIndexEntry ie0, ie1;
long stamp = System.currentTimeMillis();
try {
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie0 = (plasmaWordIndexEntry) se.next();
ie1 = large.getEntry(ie0.getUrlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
ie0.combineDistance(ie1);
conj.addEntry(ie0);
}
}
} catch (kelondroException e) {
//serverLog.logSevere("PLASMA", "joinConstructiveByTest: Database corrupt (" + e.getMessage() + "), deleting index");
small.deleteComplete();
return conj;
}
return conj;
}
private static plasmaWordIndexEntity joinConstructiveByEnumeration(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException {
System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
plasmaWordIndexEntity conj = new plasmaWordIndexEntity(null); // start with empty search result
Iterator e1 = i1.elements(true);
Iterator e2 = i2.elements(true);
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
plasmaWordIndexEntry ie1;
plasmaWordIndexEntry ie2;
try {
ie1 = (plasmaWordIndexEntry) e1.next();
} catch (kelondroException e) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database corrupt 1 (" + e.getMessage() + "), deleting index");
i1.deleteComplete();
return conj;
}
try {
ie2 = (plasmaWordIndexEntry) e2.next();
} catch (kelondroException e) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database corrupt 2 (" + e.getMessage() + "), deleting index");
i2.deleteComplete();
return conj;
}
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = ie1.getUrlHash().compareTo(ie2.getUrlHash());
if (c < 0) {
try {
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
} catch (kelondroException e) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 1 corrupt (" + e.getMessage() + "), deleting index");
i1.deleteComplete();
break;
}
} else if (c > 0) {
try {
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
} catch (kelondroException e) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 2 corrupt (" + e.getMessage() + "), deleting index");
i2.deleteComplete();
break;
}
} else {
// we have found the same urls in different searches!
ie1.combineDistance(ie2);
conj.addEntry(ie1);
try {
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
} catch (kelondroException e) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 1 corrupt (" + e.getMessage() + "), deleting index");
i1.deleteComplete();
break;
}
try {
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
} catch (kelondroException e) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 2 corrupt (" + e.getMessage() + "), deleting index");
i2.deleteComplete();
break;
}
}
}
}
return conj;
}
*/
}

@ -43,7 +43,6 @@
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
@ -53,23 +52,19 @@ import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRI;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public class plasmaWordIndexFileCluster implements indexRI {
// class variables
private final File databaseRoot;
private final serverLog log;
private int size;
private kelondroRow payloadrow;
private final File databaseRoot;
private int size;
public plasmaWordIndexFileCluster(File databaseRoot, kelondroRow payloadrow, serverLog log) {
public plasmaWordIndexFileCluster(File databaseRoot) {
this.databaseRoot = databaseRoot;
this.payloadrow = payloadrow;
this.log = log;
this.size = 0;
}
@ -77,7 +72,6 @@ public class plasmaWordIndexFileCluster implements indexRI {
return size;
}
public Iterator wordContainers(String startHash, boolean rot) {
return new containerIterator(wordHashes(startHash, rot));
}
@ -234,16 +228,16 @@ public class plasmaWordIndexFileCluster implements indexRI {
if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute
if (exists(wordHash)) {
plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
indexContainer container = new indexContainer(wordHash, payloadrow, false);
indexRWIEntry entry;
indexContainer container = new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
indexRWIEntryNew entry;
Iterator i = entity.elements(true);
while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
entry = (indexRWIEntry) i.next();
entry = new indexRWIEntryNew((indexRWIEntryOld) i.next());
if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry);
}
return container;
} else {
return new indexContainer(wordHash, payloadrow, false);
return new indexContainer(wordHash, indexRWIEntryNew.urlEntryRow);
}
}
@ -258,80 +252,26 @@ public class plasmaWordIndexFileCluster implements indexRI {
public indexContainer deleteContainer(String wordHash) {
plasmaWordIndexFile.removePlasmaIndex(databaseRoot, wordHash);
return new indexContainer(wordHash, payloadrow, false);
return null;
}
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
// removes all given url hashes from a single word index. Returns number of deletions.
plasmaWordIndexFile pi = null;
boolean removed = false;
if (exists(wordHash)) try {
pi = getEntity(wordHash, true, -1);
if (pi.removeEntry(urlHash, deleteComplete)) removed = true;
int size = pi.size();
pi.close(); pi = null;
// check if we can remove the index completely
if ((deleteComplete) && (size == 0)) deleteContainer(wordHash);
return removed;
} catch (IOException e) {
log.logSevere("plasmaWordIndexClassic.removeEntries: " + e.getMessage());
return false;
} finally {
if (pi != null) try{pi.close();}catch(Exception e){}
} else return false;
throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) {
// removes all given url hashes from a single word index. Returns number of deletions.
plasmaWordIndexFile pi = null;
int count = 0;
if (exists(wordHash)) try {
pi = getEntity(wordHash, true, -1);
Iterator i = urlHashes.iterator();
while (i.hasNext()) if (pi.removeEntry((String) i.next(), deleteComplete)) count++;
int size = pi.size();
pi.close(); pi = null;
// check if we can remove the index completely
if ((deleteComplete) && (size == 0)) deleteContainer(wordHash);
return count;
} catch (IOException e) {
log.logSevere("plasmaWordIndexClassic.removeEntries: " + e.getMessage());
return count;
} finally {
if (pi != null) try{pi.close();}catch(Exception e){}
} else return 0;
throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, payloadrow, false);
container.add(newEntry);
return addEntries(container, updateTime, dhtCase);
public void addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
public indexContainer addEntries(indexContainer container, long creationTime, boolean highPriority) {
//System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
// fetch the index cache
if ((container == null) || (container.size() == 0)) return null;
// open file
plasmaWordIndexFile pi = null;
try {
pi = new plasmaWordIndexFile(databaseRoot, container.getWordHash(), false);
pi.addEntries(container);
// close and return
pi.close(); pi = null;
return null;
} catch (IOException e) {
log.logSevere("plasmaWordIndexClassic.addEntries: " + e.getMessage());
return container;
} finally {
if (pi != null) try{pi.close();}catch (Exception e){}
}
public void addEntries(indexContainer container, long creationTime, boolean highPriority) {
throw new UnsupportedOperationException("word files are not supported in YaCy 0.491 and above");
}
public void close(int waitingSeconds) {
public void close() {
}
public int indexSize(String wordHash) {

@ -190,9 +190,9 @@ public class urlRedirectord implements serverHandler {
) {
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(this.nextURL);
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.urlPool.noticeURL.remove(urlhash);
switchboard.urlPool.errorURL.remove(urlhash);
switchboard.wordIndex.loadedURL.remove(urlhash);
switchboard.noticeURL.remove(urlhash);
switchboard.errorURL.remove(urlhash);
// enqueuing URL for crawling
reasonString = switchboard.sbStackCrawlThread.stackCrawl(

@ -52,7 +52,6 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
@ -520,30 +519,12 @@ public final class yacyClient {
// save the url entry
indexRWIEntry entry;
if (urlEntry.word() == null) {
// the old way to define words
int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
entry = wordIndex.newRWIEntry(
urlEntry.hash(),
urlLength,
urlComps,
comp.descr().length(),
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
urlEntry.moddate().getTime(),
System.currentTimeMillis(),
0,
urlEntry.language(),
urlEntry.doctype(),
0,0,
new kelondroBitfield(4)
);
} else {
// the new way: the search-result-url transports all the attributes of word indexes
entry = urlEntry.word();
yacyCore.log.logWarning("DEBUG-SEARCH: no word attached from peer " + targetPeer.getName() + ", version " + targetPeer.getVersion());
continue; // no word attached
}
// the search-result-url transports all the attributes of word indexes
entry = urlEntry.word();
if (urlEntry.snippet() != null) {
// we don't store the snippets along the url entry, because they are search-specific.
// instead, they are placed in a snipped-search cache.

@ -234,6 +234,7 @@ public class yacyDHTAction implements yacyPeerAction {
}
public static boolean shallBeOwnWord(String wordhash) {
if (yacyCore.seedDB == null) return false;
if (yacyCore.seedDB.mySeed.isPotential()) return false;
final double distance = dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash);
final double max = 1.2 / yacyCore.seedDB.sizeConnected();
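// example (assuming dhtDistance is normalized to [0,1) and the method returns
// distance < max): with 1000 connected peers, max = 0.0012, so a word counts
// as "own" only if its hash lies within about 0.12% of the hash ring around
// this peer's hash.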

@ -134,8 +134,8 @@ public class yacyPeerActions {
sb.setConfig("totalPPM", Long.toString(indexedc / 1)); //no division by zero
seedDB.mySeed.put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30)
seedDB.mySeed.put(yacySeed.LCOUNT, Integer.toString(sb.urlPool.loadedURL.size())); // the number of links that the peer has stored (LURL's)
seedDB.mySeed.put(yacySeed.NCOUNT, Integer.toString(sb.urlPool.noticeURL.stackSize())); // the number of links that the peer has noticed, but not loaded (NURL's)
seedDB.mySeed.put(yacySeed.LCOUNT, Integer.toString(sb.wordIndex.loadedURL.size())); // the number of links that the peer has stored (LURL's)
seedDB.mySeed.put(yacySeed.NCOUNT, Integer.toString(sb.noticeURL.stackSize())); // the number of links that the peer has noticed, but not loaded (NURL's)
seedDB.mySeed.put(yacySeed.ICOUNT, Integer.toString(sb.cacheSizeMin())); // the minimum number of words that the peer has indexed (as it says)
seedDB.mySeed.put(yacySeed.SCOUNT, Integer.toString(seedDB.sizeConnected())); // the number of seeds that the peer has stored
seedDB.mySeed.put(yacySeed.CCOUNT, Double.toString(((int) ((seedDB.sizeConnected() + seedDB.sizeDisconnected() + seedDB.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)

@ -72,11 +72,12 @@ import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroRow;
@ -87,10 +88,11 @@ import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURLPool;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexAssortmentCluster;
import de.anomic.plasma.plasmaWordIndexAssortment;
import de.anomic.plasma.plasmaWordIndexFile;
import de.anomic.plasma.plasmaWordIndexFileCluster;
import de.anomic.plasma.dbImport.AssortmentImporter;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
@ -655,34 +657,96 @@ public final class yacy {
File indexRoot = new File(new File(homePath), "DATA/INDEX");
serverLog log = new serverLog("WORDMIGRATION");
log.logInfo("STARTING MIGRATION");
plasmaWordIndex wordIndexCache = null;
wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, true, 20000, 10000, log);
plasmaWordIndex wordIndexCache = new plasmaWordIndex(indexRoot, 60000000, 60000000, 10000, log);
enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true);
String wordhash;
File wordfile;
Object migrationStatus;
int migrationCount;
while (words.hasMoreElements())
try {
wordfile = (File) words.nextElement();
wordhash = wordfile.getName().substring(0, 12);
// System.out.println("NOW: " + wordhash);
migrationStatus = wordIndexCache.migrateWords2index(wordhash);
if (migrationStatus instanceof Integer) {
int migrationCount = ((Integer) migrationStatus).intValue();
migrationCount = migrateWords2index(dbroot, wordhash, wordIndexCache);
if (migrationCount >= 0) {
if (migrationCount == 0)
log.logInfo("SKIPPED " + wordhash + ": empty");
else if (migrationCount > 0)
log.logInfo("MIGRATED " + wordhash + ": " + migrationCount + " entries");
else
log.logInfo("REVERSED " + wordhash + ": " + (-migrationCount) + " entries");
} else if (migrationStatus instanceof String) {
log.logInfo("SKIPPED " + wordhash + ": " + migrationStatus);
} else {
log.logInfo("SKIPPED " + wordhash);
}
} catch (Exception e) {
log.logSevere("Exception", e);
}
log.logInfo("FINISHED MIGRATION JOB, WAIT FOR DUMP");
wordIndexCache.close(60);
wordIndexCache.close();
log.logInfo("TERMINATED MIGRATION");
}
public static int migrateWords2index(File oldDatabaseRoot, String wordhash, plasmaWordIndex wi) throws IOException {
// returns the number of entries that have been imported to the collection index
// returns -1 if the word file is missing or its database is corrupted
File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash);
if (!(db.exists())) {
serverLog.logSevere("migrateWordIndex", "word index file for hash " + wordhash + " not found");
return -1;
}
plasmaWordIndexFile entity = null;
try {
entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
int size = entity.size();
indexContainer container = new indexContainer(wordhash, indexRWIEntryNew.urlEntryRow);
try {
Iterator entries = entity.elements(true);
indexRWIEntry entry;
while (entries.hasNext()) {
entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
entity.close();
entity = null;
wi.addEntries(container, container.updated(), false);
return size;
} catch (kelondroException e) {
// database corrupted, we simply give up on the database and delete it
try { entity.close(); } catch (Exception ee) { }
entity = null;
try { db.delete(); } catch (Exception ee) { }
serverLog.logSevere("migrateWordIndex", "database for hash " + wordhash + " corrupted; deleted");
return -1;
}
} finally {
if (entity != null) try {entity.close();}catch(Exception e){}
}
}
public static void migrateAssortments(String homePath) {
// run with "java -classpath classes yacy -migrateassortments"
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
serverLog log = new serverLog("ASSORTMENTMIGRATION");
File aclusterroot = new File(new File(homePath), "DATA/PLASMADB/ACLUSTER");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
plasmaWordIndex wordIndexCache = new plasmaWordIndex(indexRoot, 60000000, 60000000, 10000, log);
log.logInfo("STARTING MIGRATION");
String[] a = aclusterroot.list();
AssortmentImporter importer = new AssortmentImporter(wordIndexCache);
for (int i = a.length - 1; i >= 0; i--) {
if (a[i].startsWith("indexAssortment")) {
importer.init(new File(aclusterroot, a[i]), 16000000, 2000);
importer.run();
}
}
log.logInfo("FINISHED MIGRATION JOB, WAIT FOR DUMP");
wordIndexCache.close();
log.logInfo("TERMINATED MIGRATION");
}
@ -693,7 +757,6 @@ public final class yacy {
public static void minimizeUrlDB(String homePath, int dbcache) {
// run with "java -classpath classes yacy -minimizeUrlDB"
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
File plasmaroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
serverLog log = new serverLog("URL-CLEANUP");
try {
@ -702,17 +765,17 @@ public final class yacy {
// db containing all currently loades urls
int cache = dbcache * 1024; // in KB
log.logFine("URLDB-Caches: "+cache+" bytes");
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexRoot, cache, 10000);
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, cache, 10000);
// db used to hold all neede urls
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(plasmaroot, "minimized"), indexRoot, cache, 10000);
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot, cache, 10000);
Runtime rt = Runtime.getRuntime();
int cacheMem = (int)((serverMemory.max-rt.totalMemory())/1024)-(2*cache + 8*1024);
if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up.");
int cacheMem = (int)(serverMemory.max-rt.totalMemory());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
plasmaWordIndex wordIndex = new plasmaWordIndex(plasmaroot, indexRoot, true, cacheMem, 10000, log);
Iterator indexContainerIterator = wordIndex.wordContainers("------------", plasmaWordIndex.RL_WORDFILES, false);
plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, cacheMem, cacheMem, 10000, log);
Iterator indexContainerIterator = wordIndex.wordContainers("------------", false, false);
long urlCounter = 0, wordCounter = 0;
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
@ -767,7 +830,7 @@ public final class yacy {
currentUrlDB.close();
minimizedUrlDB.close();
wordIndex.close(600);
wordIndex.close();
// TODO: rename the minimized UrlDB to the name of the previous UrlDB
@ -941,16 +1004,16 @@ public final class yacy {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
HashMap doms = new HashMap();
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
System.out.println("Started domain list extraction from " + sb.wordIndex.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ]");
int c = 0;
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
Iterator eiter = sb.wordIndex.loadedURL.entries(true, false, null);
indexURLEntry entry;
while (eiter.hasNext()) {
try {
@ -966,11 +1029,11 @@ public final class yacy {
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("eurl")) {
Iterator eiter = pool.errorURL.entries(true, false, null);
Iterator eiter = sb.errorURL.entries(true, false, null);
plasmaCrawlEURL.Entry entry;
while (eiter.hasNext()) {
try {
@ -985,11 +1048,11 @@ public final class yacy {
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("nurl")) {
Iterator eiter = pool.noticeURL.entries(true, false, null);
Iterator eiter = sb.noticeURL.entries(true, false, null);
plasmaCrawlNURL.Entry entry;
while (eiter.hasNext()) {
try {
@ -1004,7 +1067,7 @@ public final class yacy {
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (pool.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
@ -1048,7 +1111,7 @@ public final class yacy {
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf));
}
pool.close();
sb.close();
} catch (IOException e) {
e.printStackTrace();
}
@ -1057,12 +1120,12 @@ public final class yacy {
private static void urllist(String homePath, String source, boolean html, String targetName) {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
File file = new File(root, targetName);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
Iterator eiter = sb.wordIndex.loadedURL.entries(true, false, null);
indexURLEntry entry;
while (eiter.hasNext()) {
entry = (indexURLEntry) eiter.next();
@ -1079,7 +1142,7 @@ public final class yacy {
}
}
if (source.equals("eurl")) {
Iterator eiter = pool.errorURL.entries(true, false, null);
Iterator eiter = sb.errorURL.entries(true, false, null);
plasmaCrawlEURL.Entry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlEURL.Entry) eiter.next();
@ -1095,7 +1158,7 @@ public final class yacy {
}
}
if (source.equals("nurl")) {
Iterator eiter = pool.noticeURL.entries(true, false, null);
Iterator eiter = sb.noticeURL.entries(true, false, null);
plasmaCrawlNURL.Entry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlNURL.Entry) eiter.next();
@ -1111,14 +1174,14 @@ public final class yacy {
}
}
bos.close();
pool.close();
sb.close();
} catch (IOException e) {
e.printStackTrace();
}
}
private static void migratelurls(File root, File urlHash) {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
private static void migratelurls(String homePath, File urlHash) {
final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
kelondroTree oldindex = null;
try {
oldindex = new kelondroTree(urlHash, 1000, -1, indexURLEntryOld.rowdef);
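// each row of the old urlHash.db tree is converted below into a new indexURLEntry and stored through the switchboard's loadedURL database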
@ -1146,7 +1209,7 @@ public final class yacy {
if (oldrow != null) try {
oldentry = new indexURLEntryOld(oldrow, null);
comp = oldentry.comp();
newentry = pool.loadedURL.newEntry(
newentry = sb.wordIndex.loadedURL.newEntry(
comp.url(),
comp.descr(),
"",
@ -1163,7 +1226,7 @@ public final class yacy {
new kelondroBitfield(4),
oldentry.language(),
0, 0, 0, 0, 0, 0);
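// fields that the old entry format does not carry are filled with empty strings, a fresh bitfield and zero counters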
pool.loadedURL.store(newentry);
sb.wordIndex.loadedURL.store(newentry);
c++;
} catch (IOException e) {
// ignore
@ -1173,7 +1236,7 @@ public final class yacy {
last = System.currentTimeMillis();
}
}
pool.close();
sb.close();
try { oldindex.close(); } catch (IOException e) { }
System.out.println("MIGRATION OF " + c + " URLs FINISHED");
}
@ -1193,12 +1256,11 @@ public final class yacy {
*/
private static void urldbcleanup(String homePath) {
File root = new File(homePath);
File plasmaroot = new File(root, "DATA/PLASMADB");
File indexroot = new File(root, "DATA/INDEX");
serverLog log = new serverLog("URLDBCLEANUP");
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
try {
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexroot, 4194304, 10000);
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexroot, 4194304, 10000);
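// the LURL database is now opened from DATA/INDEX alone; the DATA/PLASMADB root is no longer needed here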
currentUrlDB.urldbcleanup();
currentUrlDB.close();
} catch (IOException e) {
@ -1218,19 +1280,16 @@ public final class yacy {
try {
Iterator indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
} else if (resource.equals("assortments")) {
plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexRWIEntryOld.urlEntryRow, 16*1024*1024, 3000, log);
indexContainerIterator = assortmentCluster.wordContainers(wordChunkStartHash, true, false);
} /*else if (resource.startsWith("assortment")) {
WordIndex = new plasmaWordIndex(indexRoot, 8*1024*1024, 8*1024*1024, 3000, log);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
} else if (resource.startsWith("assortment")) {
int a = Integer.parseInt(resource.substring(10));
plasmaWordIndexAssortment assortment = new plasmaWordIndexAssortment(new File(homeDBroot, "ACLUSTER"), a, 8*1024*1024, 3000, null);
indexContainerIterator = assortment.hashes(wordChunkStartHash, true, false);
indexContainerIterator = assortment.wordContainers(wordChunkStartHash, true, false);
} else if (resource.equals("words")) {
plasmaWordIndexFileCluster fileDB = new plasmaWordIndexFileCluster(homeDBroot, log);
indexContainerIterator = fileDB.wordContainers(wordChunkStartHash, true, false);
}*/ // *** FIXME ***
plasmaWordIndexFileCluster fileDB = new plasmaWordIndexFileCluster(homeDBroot);
indexContainerIterator = fileDB.wordContainers(wordChunkStartHash, false);
}
int counter = 0;
indexContainer container = null;
if (format.equals("zip")) {
@ -1269,7 +1328,7 @@ public final class yacy {
log.logSevere("IOException", e);
}
if (WordIndex != null) {
WordIndex.close(60);
WordIndex.close();
WordIndex = null;
}
}
@ -1354,10 +1413,15 @@ public final class yacy {
if (args.length == 2) applicationRoot = args[1];
shutdown(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratewords"))) {
// migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
// migrate words from DATA/PLASMADB/WORDS path to collection index
// attention: this may run long and should not be interrupted!
if (args.length == 2) applicationRoot = args[1];
migrateWords(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migrateassortments"))) {
// migrate assortments from DATA/PLASMADB/ACLUSTER path to collection index
// attention: this may run long and should not be interrupted!
if (args.length == 2) applicationRoot = args[1];
migrateAssortments(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) {
// minimize the url database
// attention: this may run long and should not be interrupted!
@ -1437,7 +1501,7 @@ public final class yacy {
urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) {
File root = new File(applicationRoot);
migratelurls(root, new File(root, "DATA/PLASMADB/urlHash.db"));
migratelurls(applicationRoot, new File(root, "DATA/PLASMADB/urlHash.db"));
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// clean up the url database
if (args.length == 2) applicationRoot = args[1];
