diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index 2c5d10b58..72cdd622b 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -77,7 +77,7 @@ public class IndexControl_p {
prop.put("urlhash", "");
prop.put("result", "");
prop.put("wcount", "" + switchboard.wordIndex.size());
- prop.put("ucount", "" + switchboard.loadedURL.size());
+ prop.put("ucount", "" + switchboard.urlPool.loadedURL.size());
prop.put("otherHosts", "");
prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : "");
@@ -132,7 +132,7 @@ public class IndexControl_p {
}
}
if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
- if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]);
+ if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.urlPool.loadedURL.remove(urlx[i]);
switchboard.wordIndex.deleteIndex(keyhash);
post.remove("keyhashdeleteall");
if ((keystring.length() > 0) && (plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)))
@@ -143,7 +143,7 @@ public class IndexControl_p {
if (post.containsKey("keyhashdelete")) {
if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
- if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]);
+ if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.urlPool.loadedURL.remove(urlx[i]);
switchboard.wordIndex.removeEntries(keyhash, urlx, true);
// this shall lead to a presentation of the list; so handle that the remaining program
// thinks that it was called for a list presentation
@@ -161,14 +161,14 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
- plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash);
+ plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for url hash " + urlhash + "; nothing deleted.");
} else {
urlstring = htmlFilterContentScraper.urlNormalform(url);
prop.put("urlstring", "");
- switchboard.loadedURL.remove(urlhash);
+ switchboard.urlPool.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring);
}
}
@@ -198,7 +198,7 @@ public class IndexControl_p {
String result;
long starttime = System.currentTimeMillis();
indexes[0] = switchboard.wordIndex.getEntity(keyhash, true);
- result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.loadedURL);
+ result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.urlPool.loadedURL);
prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
try {indexes[0].close();} catch (IOException e) {}
}
@@ -227,7 +227,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = plasmaURL.urlHash(url);
prop.put("urlhash", urlhash);
- plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash);
+ plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (MalformedURLException e) {
prop.put("urlstring", "wrong url: " + urlstring);
@@ -236,7 +236,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashsearch")) {
- plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash);
+ plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for url hash " + urlhash);
@@ -249,7 +249,7 @@ public class IndexControl_p {
if (post.containsKey("urlhashsimilar")) {
try {
- Iterator hashIt = switchboard.loadedURL.urlHashes(urlhash, true);
+ Iterator hashIt = switchboard.urlPool.loadedURL.urlHashes(urlhash, true);
String result = "Sequential List of URL-Hashes:
";
String hash;
int i = 0;
@@ -290,7 +290,7 @@ public class IndexControl_p {
// insert constants
prop.put("wcount", "" + switchboard.wordIndex.size());
- prop.put("ucount", "" + switchboard.loadedURL.size());
+ prop.put("ucount", "" + switchboard.urlPool.loadedURL.size());
prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : "");
// return rewrite properties
@@ -307,7 +307,7 @@ public class IndexControl_p {
"
Description | " + entry.descr() + " |
" +
"Modified-Date | " + entry.moddate() + " |
" +
"Loaded-Date | " + entry.loaddate() + " |
" +
- "Referrer | " + switchboard.loadedURL.getEntry(entry.referrerHash()).url() + " |
" +
+ "Referrer | " + switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url() + " |
" +
"Doctype | " + entry.doctype() + " |
" +
"Copy-Count | " + entry.copyCount() + " |
" +
"Local-Flag | " + entry.local() + " |
" +
@@ -351,8 +351,8 @@ public class IndexControl_p {
uh = ie.getUrlHash();
result +=
"";
- if (switchboard.loadedURL.exists(uh)) {
- us = switchboard.loadedURL.getEntry(uh).url().toString();
+ if (switchboard.urlPool.loadedURL.exists(uh)) {
+ us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString();
result +=
" 0) {
- urlHash = switchboard.noticeURL.corePop().hash();
- if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
+ while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
+ urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE).hash();
+ if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
- while (switchboard.noticeURL.limitStackSize() > 0) {
- urlHash = switchboard.noticeURL.limitPop().hash();
- if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
+ while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) {
+ urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash();
+ if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
- while (switchboard.noticeURL.remoteStackSize() > 0) {
- urlHash = switchboard.noticeURL.remotePop().hash();
- if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
+ while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) > 0) { // drain the remote-triggered crawl stack
+ urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE).hash(); // pop the same stack whose size is tested (was STACK_TYPE_LIMIT, which never shrinks the remote stack)
+ if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } // c counts the removed queue entries reported as info_numEntries
+ }
prop.put("info", 3);//crawling queue cleared
prop.put("info_numEntries", c);
@@ -211,7 +211,7 @@ public class IndexCreate_p {
int queueStackSize = switchboard.queueStack.size();
int loaderThreadsSize = switchboard.cacheLoader.size();
- int crawlerListSize = switchboard.noticeURL.stackSize();
+ int crawlerListSize = switchboard.urlPool.noticeURL.stackSize();
int completequeue = queueStackSize + loaderThreadsSize + crawlerListSize;
if ((completequeue > 0) || ((post != null) && (post.containsKey("refreshpage")))) {
@@ -279,11 +279,11 @@ public class IndexCreate_p {
}
// failure cases
- if (switchboard.errorURL.stackSize() != 0) {
- if (showRejectedCount > switchboard.errorURL.stackSize()) showRejectedCount = switchboard.errorURL.stackSize();
+ if (switchboard.urlPool.errorURL.stackSize() != 0) {
+ if (showRejectedCount > switchboard.urlPool.errorURL.stackSize()) showRejectedCount = switchboard.urlPool.errorURL.stackSize();
prop.put("rejected", 1);
- prop.put("rejected_num", switchboard.errorURL.stackSize());
- if (showRejectedCount != switchboard.errorURL.stackSize()) {
+ prop.put("rejected_num", switchboard.urlPool.errorURL.stackSize());
+ if (showRejectedCount != switchboard.urlPool.errorURL.stackSize()) {
prop.put("rejected_only-latest", 1);
prop.put("rejected_only-latest_num", showRejectedCount);
prop.put("rejected_only-latest_newnum", ((int) (showRejectedCount * 1.5)));
@@ -295,8 +295,8 @@ public class IndexCreate_p {
plasmaCrawlEURL.entry entry;
yacySeed initiatorSeed, executorSeed;
int j=0;
- for (i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) {
- entry = (plasmaCrawlEURL.entry) switchboard.errorURL.getStack(i);
+ for (i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
+ entry = (plasmaCrawlEURL.entry) switchboard.urlPool.errorURL.getStack(i);
initiatorHash = entry.initiator();
executorHash = entry.executor();
url = entry.url().toString();
@@ -380,12 +380,12 @@ public class IndexCreate_p {
prop.put("loader-set_list", i );
}
- int localStackSize = switchboard.noticeURL.coreStackSize();
+ int localStackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
if (localStackSize == 0) {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
- plasmaCrawlNURL.entry[] crawlerList = switchboard.noticeURL.coreTop(20);
+ plasmaCrawlNURL.entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 20);
prop.put("crawler-queue_num", localStackSize);//num Entries
prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent
plasmaCrawlNURL.entry urle;
diff --git a/htroot/IndexMonitor.java b/htroot/IndexMonitor.java
index a6ff070ee..5793e1fd0 100644
--- a/htroot/IndexMonitor.java
+++ b/htroot/IndexMonitor.java
@@ -95,12 +95,12 @@ public class IndexMonitor {
}
// do the commands
- if (post.containsKey("clearlist")) switchboard.loadedURL.clearStack(process);
+ if (post.containsKey("clearlist")) switchboard.urlPool.loadedURL.clearStack(process);
if (post.containsKey("deleteentry")) {
String hash = post.get("hash", null);
if (hash != null) {
// delete from database
- switchboard.loadedURL.remove(hash);
+ switchboard.urlPool.loadedURL.remove(hash);
}
}
if (post.containsKey("moreIndexed")) {
@@ -113,7 +113,7 @@ public class IndexMonitor {
if (process == 0) {
prop.put("table", 2);
} else {
- prop.putAll(switchboard.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true));
+ prop.putAll(switchboard.urlPool.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true));
}
prop.put("process", process);
// return rewrite properties
diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java
index 4315153cc..69321499f 100644
--- a/htroot/IndexShare_p.java
+++ b/htroot/IndexShare_p.java
@@ -66,7 +66,7 @@ public class IndexShare_p {
prop.put("dtable", "");
prop.put("rtable", "");
prop.put("wcount", "" + switchboard.wordIndex.size());
- prop.put("ucount", "" + switchboard.loadedURL.size());
+ prop.put("ucount", "" + switchboard.urlPool.loadedURL.size());
return prop; // be save
}
@@ -79,7 +79,7 @@ public class IndexShare_p {
// insert constants
prop.put("wcount", "" + switchboard.wordIndex.size());
- prop.put("ucount", "" + switchboard.loadedURL.size());
+ prop.put("ucount", "" + switchboard.urlPool.loadedURL.size());
// return rewrite properties
return prop;
}
diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java
index 1dd018b8f..fb9b7fcca 100644
--- a/htroot/ProxyIndexingMonitor_p.java
+++ b/htroot/ProxyIndexingMonitor_p.java
@@ -74,12 +74,12 @@ public class ProxyIndexingMonitor_p {
prop.put("info_message", "");
if (post != null) {
- if (post.containsKey("clearlist4")) switchboard.loadedURL.clearStack(4); // local: by proxy crawl
+ if (post.containsKey("clearlist4")) switchboard.urlPool.loadedURL.clearStack(4); // local: by proxy crawl
if (post.containsKey("deleteentry")) {
String hash = post.get("hash", null);
if (hash != null) {
// delete from database
- switchboard.loadedURL.remove(hash);
+ switchboard.urlPool.loadedURL.remove(hash);
}
}
@@ -123,7 +123,7 @@ public class ProxyIndexingMonitor_p {
// create tables
String myname = yacyCore.seedDB.mySeed.getName();
- prop.putAll(switchboard.loadedURL.genTableProps(4, showIndexedCount, false, false, "proxy", null, "ProxyIndexingMonitor_p.html", true));
+ prop.putAll(switchboard.urlPool.loadedURL.genTableProps(4, showIndexedCount, false, false, "proxy", null, "ProxyIndexingMonitor_p.html", true));
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0);
diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java
index 936e85781..5586ba4e7 100644
--- a/htroot/htdocsdefault/dir.java
+++ b/htroot/htdocsdefault/dir.java
@@ -443,7 +443,7 @@ public class dir {
try {
URL url = new URL(urlstring);
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
- plasmaCrawlLURL.entry newEntry = switchboard.loadedURL.newEntry(
+ plasmaCrawlLURL.entry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(),
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/
@@ -468,7 +468,7 @@ public class dir {
String urlhash = plasmaURL.urlHash(new URL(urlstring));
Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes());
switchboard.removeReferences(urlhash, words);
- switchboard.loadedURL.remove(urlhash);
+ switchboard.urlPool.loadedURL.remove(urlhash);
} catch (Exception e) {
System.out.println("INTERNAL ERROR in dir.deletePhrase:");
e.printStackTrace();
diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java
index ed619d7db..3e889ba36 100644
--- a/htroot/yacy/crawlOrder.java
+++ b/htroot/yacy/crawlOrder.java
@@ -152,10 +152,10 @@ public class crawlOrder {
reason = reasonString;
delay = "" + (acceptDelay / 4);
// send lurl-Entry as response
- plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
+ plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
if (entry != null) {
response = "double";
- switchboard.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
+ switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString());
delay = "1";
} else {
diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java
index bbfe013ed..5928d6098 100644
--- a/htroot/yacy/crawlReceipt.java
+++ b/htroot/yacy/crawlReceipt.java
@@ -111,11 +111,11 @@ public class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// put new data into database
- switchboard.loadedURL.newEntry(propStr, true, youare, iam, 1);
- switchboard.noticeURL.remove(urlhash);
+ switchboard.urlPool.loadedURL.newEntry(propStr, true, youare, iam, 1);
+ switchboard.urlPool.noticeURL.remove(urlhash);
// write log
- plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash);
+ plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
if (entry == null) {
switchboard.getLog().logError("RECEIVED wrong RECEIPT for hash " + urlhash + " from peer " + iam);
} else {
@@ -125,10 +125,10 @@ public class crawlReceipt {
// ready for more
prop.put("delay", "10");
} else {
- plasmaCrawlNURL.entry en = switchboard.noticeURL.getEntry(urlhash);
+ plasmaCrawlNURL.entry en = switchboard.urlPool.noticeURL.getEntry(urlhash);
if (en != null) {
- switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false);
- switchboard.noticeURL.remove(urlhash);
+ switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false);
+ switchboard.urlPool.noticeURL.remove(urlhash);
}
prop.put("delay", "100"); // what shall we do with that???
}
diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java
index 505d05651..8138fe0fc 100644
--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@@ -110,7 +110,7 @@ public class transferRWI {
switchboard.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
urlHash = entry.getUrlHash();
if ((!(unknownURL.contains(urlHash))) &&
- (!(switchboard.loadedURL.exists(urlHash)))) {
+ (!(switchboard.urlPool.loadedURL.exists(urlHash)))) {
unknownURL.add(urlHash);
}
received++;
diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java
index 1dcafca8d..cec81faed 100644
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@@ -71,13 +71,13 @@ public class transferURL {
if (granted) {
int received = 0;
- int sizeBefore = switchboard.loadedURL.size();
+ int sizeBefore = switchboard.urlPool.loadedURL.size();
// read the urls from the other properties and store
String urls;
for (int i = 0; i < urlc; i++) {
urls = (String) post.get("url" + i);
if (urls != null) {
- switchboard.loadedURL.newEntry(urls, true, iam, iam, 3);
+ switchboard.urlPool.loadedURL.newEntry(urls, true, iam, iam, 3);
received++;
}
}
@@ -85,7 +85,7 @@ public class transferURL {
yacyCore.seedDB.mySeed.incRU(received);
// return rewrite properties
- int more = switchboard.loadedURL.size() - sizeBefore;
+ int more = switchboard.urlPool.loadedURL.size() - sizeBefore;
doublevalues = "" + (received - more);
switchboard.getLog().logInfo("Received " + received + " URL's from peer " + iam);
if ((received - more) > 0) switchboard.getLog().logError("Received " + doublevalues + " double URL's from peer " + iam);
diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java
index d1c2577d5..144c71938 100644
--- a/source/de/anomic/plasma/plasmaCrawlNURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlNURL.java
@@ -61,16 +61,22 @@ import de.anomic.tools.bitfield;
public class plasmaCrawlNURL extends plasmaURL {
- public static final int STACK_TYPE_NULL = 0; // do not stack
- public static final int STACK_TYPE_CORE = 1; // put on local stack
- public static final int STACK_TYPE_LIMIT = 2; // put on global stack
- public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled
- public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack
+ public static final int STACK_TYPE_NULL = 0; // do not stack
+ public static final int STACK_TYPE_CORE = 1; // put on local stack
+ public static final int STACK_TYPE_LIMIT = 2; // put on global stack
+ public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled
+ public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack
+ public static final int STACK_TYPE_IMAGE = 11; // put on image stack
+ public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
+ public static final int STACK_TYPE_MUSIC = 13; // put on music stack
private kelondroStack coreStack; // links found by crawling to depth-1
private kelondroStack limitStack; // links found by crawling at target depth
private kelondroStack overhangStack; // links found by crawling at depth+1
private kelondroStack remoteStack; // links from remote crawl orders
+ private kelondroStack imageStack; // links pointing to image resources
+ private kelondroStack movieStack; // links pointing to movie resources
+ private kelondroStack musicStack; // links pointing to music resources
private HashSet stackIndex; // to find out if a specific link is already on any stack
@@ -104,39 +110,41 @@ public class plasmaCrawlNURL extends plasmaURL {
urlHashCache = new kelondroTree(cacheFile, bufferkb * 0x400, ce);
}
- File localCrawlStack = new File(cacheStacksPath, "urlNoticeLocal0.stack");
- if (localCrawlStack.exists()) {
- coreStack = new kelondroStack(localCrawlStack, 0);
- } else {
- coreStack = new kelondroStack(localCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
- }
- File limitCrawlStack = new File(cacheStacksPath, "urlNoticeLimit0.stack");
- if (limitCrawlStack.exists()) {
- limitStack = new kelondroStack(limitCrawlStack, 0);
- } else {
- limitStack = new kelondroStack(limitCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
- }
- File overhangCrawlStack = new File(cacheStacksPath, "urlNoticeOverhang0.stack");
- if (overhangCrawlStack.exists()) {
- overhangStack = new kelondroStack(overhangCrawlStack, 0);
- } else {
- overhangStack = new kelondroStack(overhangCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
- }
- File globalCrawlStack = new File(cacheStacksPath, "urlNoticeRemote0.stack");
- if (globalCrawlStack.exists()) {
- remoteStack = new kelondroStack(globalCrawlStack, 0);
- } else {
- remoteStack = new kelondroStack(globalCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
- }
-
+ File coreStackFile = new File(cacheStacksPath, "urlNoticeLocal0.stack");
+ File limitStackFile = new File(cacheStacksPath, "urlNoticeLimit0.stack");
+ File overhangStackFile = new File(cacheStacksPath, "urlNoticeOverhang0.stack");
+ File remoteStackFile = new File(cacheStacksPath, "urlNoticeRemote0.stack");
+ File imageStackFile = new File(cacheStacksPath, "urlNoticeImage0.stack");
+ File movieStackFile = new File(cacheStacksPath, "urlNoticeMovie0.stack");
+ File musicStackFile = new File(cacheStacksPath, "urlNoticeMusic0.stack");
+ if (coreStackFile.exists()) coreStack = new kelondroStack(coreStackFile, 0); else coreStack = new kelondroStack(coreStackFile, 0, new int[] {plasmaURL.urlHashLength});
+ if (limitStackFile.exists()) limitStack = new kelondroStack(limitStackFile, 0); else limitStack = new kelondroStack(limitStackFile, 0, new int[] {plasmaURL.urlHashLength});
+ if (overhangStackFile.exists()) overhangStack = new kelondroStack(overhangStackFile, 0); else overhangStack = new kelondroStack(overhangStackFile, 0, new int[] {plasmaURL.urlHashLength});
+ if (remoteStackFile.exists()) remoteStack = new kelondroStack(remoteStackFile, 0); else remoteStack = new kelondroStack(remoteStackFile, 0, new int[] {plasmaURL.urlHashLength});
+ if (imageStackFile.exists()) imageStack = new kelondroStack(imageStackFile, 0); else imageStack = new kelondroStack(imageStackFile, 0, new int[] {plasmaURL.urlHashLength});
+ if (movieStackFile.exists()) movieStack = new kelondroStack(movieStackFile, 0); else movieStack = new kelondroStack(movieStackFile, 0, new int[] {plasmaURL.urlHashLength});
+ if (musicStackFile.exists()) musicStack = new kelondroStack(musicStackFile, 0); else musicStack = new kelondroStack(musicStackFile, 0, new int[] {plasmaURL.urlHashLength});
+
// init stack Index
stackIndex = new HashSet();
- Iterator i = coreStack.iterator();
- while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
- i = remoteStack.iterator();
- while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
+ new initStackIndex().start();
}
+ public class initStackIndex extends Thread { // thread that fills stackIndex with the keys found on all seven notice stacks
+ public void run() { // NOTE(review): stackIndex is a plain HashSet that is also added to after a push elsewhere in this class - confirm concurrent access is safe
+ Iterator i; // reused for each stack in turn
+ try {
+ i = coreStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
+ i = limitStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
+ i = overhangStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
+ i = remoteStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
+ i = imageStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
+ i = movieStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
+ i = musicStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
+ } catch (IOException e) {} // NOTE(review): the exception is ignored, so stackIndex may stay incomplete after a read error - confirm this is acceptable
+ }
+ }
+
private static String normalizeHost(String host) {
if (host.length() > urlHostLength) host = host.substring(0, urlHostLength);
host = host.toLowerCase();
@@ -155,20 +163,17 @@ public class plasmaCrawlNURL extends plasmaURL {
return coreStack.size() + limitStack.size() + remoteStack.size();
}
- public int coreStackSize() {
- return coreStack.size();
- }
-
- public int limitStackSize() {
- return limitStack.size();
- }
-
- public int overhangStackSize() {
- return overhangStack.size();
- }
-
- public int remoteStackSize() {
- return remoteStack.size();
+ public int stackSize(int stackType) { // returns the size of the stack selected by one of the STACK_TYPE_* constants
+ switch (stackType) {
+ case STACK_TYPE_CORE: return coreStack.size();
+ case STACK_TYPE_LIMIT: return limitStack.size();
+ case STACK_TYPE_OVERHANG: return overhangStack.size();
+ case STACK_TYPE_REMOTE: return remoteStack.size();
+ case STACK_TYPE_IMAGE: return imageStack.size();
+ case STACK_TYPE_MOVIE: return movieStack.size();
+ case STACK_TYPE_MUSIC: return musicStack.size();
+ default: return -1; // no stack belongs to this type (this includes STACK_TYPE_NULL)
+ }
+ }
public boolean existsInStack(String urlhash) {
@@ -179,35 +184,48 @@ public class plasmaCrawlNURL extends plasmaURL {
String profile, int depth, int anchors, int forkfactor, int stackMode) {
entry e = new entry(initiator, url, referrer, name, loaddate, profile,
depth, anchors, forkfactor);
-
- // stackMode can have 3 cases:
- // 0 = do not stack
- // 1 = on local stack
- // 2 = on global stack
- // 3 = on overhang stack
- // 4 = on remote stack
try {
- if (stackMode == 1) coreStack.push(new byte[][] {e.hash.getBytes()});
- if (stackMode == 2) limitStack.push(new byte[][] {e.hash.getBytes()});
- if (stackMode == 3) overhangStack.push(new byte[][] {e.hash.getBytes()});
- if (stackMode == 4) remoteStack.push(new byte[][] {e.hash.getBytes()});
+ switch (stackMode) {
+ case STACK_TYPE_CORE: coreStack.push(new byte[][] {e.hash.getBytes()}); break;
+ case STACK_TYPE_LIMIT: limitStack.push(new byte[][] {e.hash.getBytes()}); break;
+ case STACK_TYPE_OVERHANG: overhangStack.push(new byte[][] {e.hash.getBytes()}); break;
+ case STACK_TYPE_REMOTE: remoteStack.push(new byte[][] {e.hash.getBytes()}); break;
+ case STACK_TYPE_IMAGE: imageStack.push(new byte[][] {e.hash.getBytes()}); break;
+ case STACK_TYPE_MOVIE: movieStack.push(new byte[][] {e.hash.getBytes()}); break;
+ case STACK_TYPE_MUSIC: musicStack.push(new byte[][] {e.hash.getBytes()}); break;
+ default: break;
+ }
stackIndex.add(new String(e.hash.getBytes()));
} catch (IOException er) {
}
return e;
}
- public entry corePop() { return pop(coreStack); }
- public entry[] coreTop(int count) { return top(coreStack, count); }
-
- public entry limitPop() { return pop(limitStack); }
- public entry[] limitTop(int count) { return top(limitStack, count); }
-
- public entry overhangPop() { return pop(overhangStack); }
- public entry[] overhangTop(int count) { return top(overhangStack, count); }
+ public entry[] top(int stackType, int count) { // forwards to top(stack, count) for the stack selected by a STACK_TYPE_* constant
+ switch (stackType) {
+ case STACK_TYPE_CORE: return top(coreStack, count);
+ case STACK_TYPE_LIMIT: return top(limitStack, count);
+ case STACK_TYPE_OVERHANG: return top(overhangStack, count);
+ case STACK_TYPE_REMOTE: return top(remoteStack, count);
+ case STACK_TYPE_IMAGE: return top(imageStack, count);
+ case STACK_TYPE_MOVIE: return top(movieStack, count);
+ case STACK_TYPE_MUSIC: return top(musicStack, count);
+ default: return null; // unknown stack type: the caller receives null instead of an array
+ }
+ }
- public entry remotePop() { return pop(remoteStack); }
- public entry[] remoteTop(int count) { return top(remoteStack, count); }
+ public entry pop(int stackType) { // forwards to pop(stack) for the stack selected by a STACK_TYPE_* constant
+ switch (stackType) {
+ case STACK_TYPE_CORE: return pop(coreStack);
+ case STACK_TYPE_LIMIT: return pop(limitStack);
+ case STACK_TYPE_OVERHANG: return pop(overhangStack);
+ case STACK_TYPE_REMOTE: return pop(remoteStack);
+ case STACK_TYPE_IMAGE: return pop(imageStack);
+ case STACK_TYPE_MOVIE: return pop(movieStack);
+ case STACK_TYPE_MUSIC: return pop(musicStack);
+ default: return null; // unknown stack type: the caller receives null instead of an entry
+ }
+ }
private entry pop(kelondroStack stack) {
// this is a filo - pop
@@ -237,7 +255,7 @@ public class plasmaCrawlNURL extends plasmaURL {
return null;
}
}
-
+
public synchronized entry getEntry(String hash) {
return new entry(hash);
}
@@ -247,10 +265,11 @@ public class plasmaCrawlNURL extends plasmaURL {
urlHashCache.remove(hash.getBytes());
} catch (IOException e) {}
}
-
+
public class entry {
- private String initiator; // the initiator hash, is NULL or "" if it is the own proxy
+ private String initiator; // the initiator hash, is NULL or "" if it is the own proxy;
+ // if this is generated by a crawl, the own peer hash in entered
private String hash; // the url's hash
private String referrer; // the url's referrer hash
private URL url; // the url as string
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 1d78f87f4..bb035fe18 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -158,9 +158,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private File cachePath;
private File plasmaPath;
public File listsPath;
- public plasmaCrawlLURL loadedURL;
- public plasmaCrawlNURL noticeURL;
- public plasmaCrawlEURL errorURL;
+ public plasmaURLPool urlPool;
public plasmaWordIndex wordIndex;
public plasmaSearch searchManager;
public plasmaHTCache cacheManager;
@@ -248,13 +246,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start indexing management
log.logSystem("Starting Indexing Management");
- loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
- noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL);
- errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL);
+ urlPool = new plasmaURLPool(plasmaPath, ramLURL, ramNURL, ramEURL);
+
+
wordIndex = new plasmaWordIndex(plasmaPath, ramRWI, log);
int wordCacheMax = Integer.parseInt((String) getConfig("wordCacheMax", "10000"));
wordIndex.setMaxWords(wordCacheMax);
- searchManager = new plasmaSearch(loadedURL, wordIndex);
+ searchManager = new plasmaSearch(urlPool.loadedURL, wordIndex);
// start a cache manager
log.logSystem("Starting HT Cache Manager");
@@ -402,7 +400,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} catch (IOException e) {}
}
private void cleanProfiles() {
- if ((queueStack.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return;
+ if ((queueStack.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return;
Iterator i = profiles.profiles(true);
plasmaCrawlProfile.entry entry;
try {
@@ -446,9 +444,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wikiDB.close();
messageDB.close();
facilityDB.close();
- loadedURL.close();
- noticeURL.close();
- errorURL.close();
+ urlPool.close();
profiles.close();
parser.close();
cacheManager.close();
@@ -468,7 +464,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int lUrlSize() {
- return loadedURL.size();
+ return urlPool.loadedURL.size();
}
public int cacheSizeMin() {
@@ -496,10 +492,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do one processing step
log.logDebug("DEQUEUE: cacheManager=" + ((cacheManager.idle()) ? "idle" : "busy") +
", queueStack=" + queueStack.size() +
- ", coreStackSize=" + noticeURL.coreStackSize() +
- ", limitStackSize=" + noticeURL.limitStackSize() +
- ", overhangStackSize=" + noticeURL.overhangStackSize() +
- ", remoteStackSize=" + noticeURL.remoteStackSize());
+ ", coreStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
+ ", limitStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) +
+ ", overhangStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) +
+ ", remoteStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
nextentry = (plasmaHTCache.Entry) queueStack.removeFirst();
}
processResourceStack(nextentry);
@@ -508,9 +504,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public int cleanupJobSize() {
int c = 0;
- if ((errorURL.stackSize() > 1000)) c++;
+ if ((urlPool.errorURL.stackSize() > 1000)) c++;
for (int i = 1; i <= 6; i++) {
- if (loadedURL.getStackSize(i) > 1000) c++;
+ if (urlPool.loadedURL.getStackSize(i) > 1000) c++;
}
return c;
}
@@ -520,14 +516,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
boolean hasDoneSomething = false;
// clean up error stack
- if ((errorURL.stackSize() > 1000)) {
- errorURL.clearStack();
+ if ((urlPool.errorURL.stackSize() > 1000)) {
+ urlPool.errorURL.clearStack();
hasDoneSomething = true;
}
// clean up loadedURL stack
for (int i = 1; i <= 6; i++) {
- if (loadedURL.getStackSize(i) > 1000) {
- loadedURL.clearStack(i);
+ if (urlPool.loadedURL.getStackSize(i) > 1000) {
+ urlPool.loadedURL.clearStack(i);
hasDoneSomething = true;
}
}
@@ -567,11 +563,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int coreCrawlJobSize() {
- return noticeURL.coreStackSize();
+ return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
}
public boolean coreCrawlJob() {
- if (noticeURL.coreStackSize() == 0) {
+ if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
}
@@ -600,13 +596,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// do a local crawl
- plasmaCrawlNURL.entry urlEntry = noticeURL.corePop();
+ plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
if (urlEntry.url() == null) return false;
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
- log.logError("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
+ log.logError("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
@@ -617,11 +613,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int limitCrawlTriggerJobSize() {
- return noticeURL.limitStackSize();
+ return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
}
public boolean limitCrawlTriggerJob() {
- if (noticeURL.limitStackSize() == 0) {
+ if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) {
//log.logDebug("LimitCrawl: queue is empty");
return false;
}
@@ -639,13 +635,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// start a global crawl, if possible
- plasmaCrawlNURL.entry urlEntry = noticeURL.limitPop();
+ plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (urlEntry.url() == null) return true;
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
- log.logError("REMOTECRAWLTRIGGER[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
+ log.logError("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
@@ -653,7 +649,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
boolean tryRemote =
- ((noticeURL.coreStackSize() != 0) || (queueStack.size() != 0)) /* should do ourself */ &&
+ ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (queueStack.size() != 0)) /* should do ourself */ &&
(profile.remoteIndexing()) /* granted */ &&
(urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) /* not proxy */ &&
((yacyCore.seedDB.mySeed.isSenior()) ||
@@ -681,7 +677,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int remoteTriggeredCrawlJobSize() {
- return noticeURL.remoteStackSize();
+ return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
}
public boolean remoteTriggeredCrawlJob() {
@@ -689,7 +685,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do nothing if either there are private processes to be done
// or there is no global crawl on the stack
- if (noticeURL.remoteStackSize() == 0) {
+ if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
//log.logDebug("GlobalCrawl: queue is empty");
return false;
}
@@ -720,13 +716,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
- plasmaCrawlNURL.entry urlEntry = noticeURL.remotePop();
+ plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
if (urlEntry.url() == null) return false;
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
- log.logError("REMOTETRIGGEREDCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
+ log.logError("REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
@@ -807,12 +803,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (rejectReason == null) {
c++;
} else {
- errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
+ urlPool.errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +
- ", NEW CRAWL STACK SIZE IS " + noticeURL.coreStackSize());
+ ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
}
// create index
@@ -835,7 +831,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//log.logInfo("INDEXING HEADLINE:" + descr);
try {
log.logDebug("(Profile) Create LURL-Entry for '" + entry.nomalizedURLString + "'");
- plasmaCrawlLURL.entry newEntry = loadedURL.newEntry(
+ plasmaCrawlLURL.entry newEntry = urlPool.loadedURL.newEntry(
entry.url, descr, entry.lastModified, new Date(),
initiatorHash,
yacyCore.seedDB.mySeed.hash,
@@ -850,7 +846,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String urlHash = newEntry.hash();
log.logDebug("(Profile) Remove NURL for '" + entry.nomalizedURLString + "'");
- noticeURL.remove(urlHash); // worked-off
+ urlPool.noticeURL.remove(urlHash); // worked-off
if (((processCase == 4) || (processCase == 5) || (processCase == 6)) &&
(entry.profile.localIndexing())) {
@@ -882,7 +878,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
log.logInfo("Not indexed any word in URL " + entry.url + "; cause: " + noIndexReason);
- errorURL.newEntry(entry.url, referrerHash,
+ urlPool.errorURL.newEntry(entry.url, referrerHash,
((entry.proxy()) ? plasmaURL.dummyHash : entry.initiator()),
yacyCore.seedDB.mySeed.hash,
descr, noIndexReason, new bitfield(plasmaURL.urlFlagLength), true);
@@ -925,7 +921,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// filter deny
if ((currentdepth > 0) && (!(nexturlString.matches(profile.generalFilter())))) {
reason = "denied_(does_not_match_filter)";
- errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+ urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
@@ -933,7 +929,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// deny cgi
if (plasmaHTCache.isCGI(nexturlString)) {
reason = "denied_(cgi_url)";
- errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+ urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
@@ -941,22 +937,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// deny post properties
if ((plasmaHTCache.isPOST(nexturlString)) && (!(profile.crawlingQ()))) {
reason = "denied_(post_url)";
- errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+ urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
String nexturlhash = plasmaURL.urlHash(nexturl);
- if (loadedURL.exists(nexturlhash)) {
+ String dbocc = "";
+ if ((dbocc = urlPool.testHash(nexturlhash)) != null) {
// DISTIGUISH OLD/RE-SEARCH CASES HERE!
- reason = "double_(already_loaded)";
- errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
- name, reason, new bitfield(plasmaURL.urlFlagLength), false);
- return reason;
- }
- if (noticeURL.existsInStack(nexturlhash)) {
- reason = "double_(noticed_in_crawler)";
- errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+ reason = "double_(registered_in_" + dbocc + ")";
+ urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
@@ -970,7 +961,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
((yacyCore.seedDB.mySeed.isSenior()) ||
(yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */;
- noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
+ urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */
loadDate, /* load date */
referrerHash, /* last url in crawling queue */
@@ -988,11 +979,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private URL hash2url(String urlhash) {
if (urlhash.equals(plasmaURL.dummyHash)) return null;
- plasmaCrawlNURL.entry ne = noticeURL.getEntry(urlhash);
+ plasmaCrawlNURL.entry ne = urlPool.noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
- plasmaCrawlLURL.entry le = loadedURL.getEntry(urlhash);
+ plasmaCrawlLURL.entry le = urlPool.loadedURL.getEntry(urlhash);
if (le != null) return le.url();
- plasmaCrawlEURL.entry ee = errorURL.getEntry(urlhash);
+ plasmaCrawlEURL.entry ee = urlPool.errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;
}
@@ -1005,17 +996,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private boolean processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile) {
// work off one Crawl stack entry
- if ((urlEntry == null) && (urlEntry.url() == null)) {
- log.logInfo("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.limitStackSize() + ", " + noticeURL.overhangStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null");
+ // hands the url of the entry to the cache loader and returns true; returns false
+ // without loading when there is no entry or the entry carries no url.
+ // The check must be '||': with '&&' the url() call was evaluated exactly when
+ // urlEntry was null (NullPointerException), and an entry with a null url
+ // slipped through to loadParallel.
+ if ((urlEntry == null) || (urlEntry.url() == null)) {
+ log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return false;
}
cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
- log.logInfo("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.limitStackSize() + ", " + noticeURL.overhangStackSize() + ", " + noticeURL.remoteStackSize() + "]: enqueued for load " + urlEntry.url());
+ log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: enqueued for load " + urlEntry.url());
return true;
}
private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.entry urlEntry) {
if (urlEntry == null) {
- log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null");
+ log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return false;
}
@@ -1077,8 +1068,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
- plasmaCrawlLURL.entry entry = loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
- noticeURL.remove(entry.hash());
+ plasmaCrawlLURL.entry entry = urlPool.loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
+ urlPool.noticeURL.remove(entry.hash());
log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
@@ -1173,7 +1164,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int fetchpeers = ((int) time / 1000) * 3; // number of target peers; means 30 peers in 10 seconds
long fetchtime = time * 7 / 10; // time to waste
if (fetchcount > count) fetchcount = count;
- globalresults = yacySearch.searchHashes(queryhashes, loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime);
+ globalresults = yacySearch.searchHashes(queryhashes, urlPool.loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime);
log.logDebug("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
}
prop.put("globalresults", globalresults); // the result are written to the local DB
@@ -1217,7 +1208,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes()));
- loadedURL.remove(urlentry.hash()); // clean up
+ urlPool.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
url = new URL("http://" + address + "/" + host.substring(0, p) + filename);
@@ -1367,7 +1358,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (actionName.equals("urlcount")) {
serverObjects result = new serverObjects();
- result.put("urls","" + loadedURL.size());
+ result.put("urls","" + urlPool.loadedURL.size());
return result;
}
@@ -1392,7 +1383,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
- plasmaCrawlLURL.entry entry = loadedURL.getEntry(urlhash);
+ plasmaCrawlLURL.entry entry = urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) return 0;
// get set of words
@@ -1401,7 +1392,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// delete all word references
int count = removeReferences(urlhash, words);
// finally delete the url entry itself
- loadedURL.remove(urlhash);
+ urlPool.loadedURL.remove(urlhash);
return count;
}
@@ -1443,7 +1434,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((yacyCore.seedDB == null) ||
(yacyCore.seedDB.mySeed == null) ||
(yacyCore.seedDB.mySeed.isVirgin()) ||
- (loadedURL.size() < 10) ||
+ (urlPool.loadedURL.size() < 10) ||
(wordIndex.size() < 100) ||
(!(yacyCore.seedDB.mySeed.isJunior()))) return false;
@@ -1453,7 +1444,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (
(queueStack.size() == 0) &&
(cacheLoader.size() == 0) &&
- (noticeURL.stackSize() == 0) &&
+ (urlPool.noticeURL.stackSize() == 0) &&
(getConfig("allowDistributeIndex", "false").equals("true")) &&
((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) {
indexCount = transferred;
@@ -1508,7 +1499,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while ((e.hasMoreElements()) && (hc < peerCount)) {
seed = (yacySeed) e.nextElement();
if (seed != null) {
- error = yacyClient.transferIndex(seed, indexEntities, loadedURL);
+ error = yacyClient.transferIndex(seed, indexEntities, urlPool.loadedURL);
if (error == null) {
log.logInfo("Index Transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull");
peerNames += ", " + seed.getName();
diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java
new file mode 100644
index 000000000..93e3318e0
--- /dev/null
+++ b/source/de/anomic/plasma/plasmaURLPool.java
@@ -0,0 +1,77 @@
+// plasmaURLPool.java
+// -----------------------
+// part of YaCy
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2005
+// last major change: 16.06.2005
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+// this class combines all url storage methods into one. It is the host for all url storage
+
+
+package de.anomic.plasma;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Bundles the three url databases (loaded, noticed and error urls) that live
+ * below one plasma directory, so that they can be opened, queried and closed
+ * through a single object.
+ */
+public class plasmaURLPool {
+
+    /** url database backed by the file "urlHash.db" */
+    public plasmaCrawlLURL loadedURL;
+    /** url database of the crawler stacks, opened on the plasma directory itself */
+    public plasmaCrawlNURL noticeURL;
+    /** url database backed by the file "urlErr0.db" */
+    public plasmaCrawlEURL errorURL;
+
+    /**
+     * Opens all three databases below the given directory.
+     *
+     * @param plasmaPath directory that holds (or will hold) the database files
+     * @param ramLURL    second constructor argument of the loaded-url database
+     *                   (presumably its RAM/cache budget -- TODO confirm)
+     * @param ramNURL    same, for the notice-url database
+     * @param ramEURL    same, for the error-url database
+     * @throws IOException if one of the databases cannot be opened
+     */
+    public plasmaURLPool(File plasmaPath, int ramLURL, int ramNURL, int ramEURL) throws IOException {
+        loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
+        noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL);
+        errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL);
+    }
+
+    /**
+     * Tests whether a url hash is already registered in the loaded-url or the
+     * notice-url database. The error-url database is not consulted.
+     *
+     * @param hash the url hash to look up
+     * @return "loaded" if the hash exists in loadedURL, otherwise "crawler" if
+     *         it exists in a noticeURL stack, otherwise null
+     */
+    public String testHash(String hash) {
+        // loadedURL is asked first, so a hash that occurs in both is reported as "loaded"
+        if (loadedURL.exists(hash)) return "loaded";
+        if (noticeURL.existsInStack(hash)) return "crawler";
+        return null;
+    }
+
+    /**
+     * Closes all three databases.
+     *
+     * @throws IOException if closing a database fails; when several closes
+     *         fail, the exception of the last failing one is propagated
+     */
+    public void close() throws IOException {
+        // nested finally blocks: a failing close must not leave the remaining
+        // databases open
+        try {
+            loadedURL.close();
+        } finally {
+            try {
+                noticeURL.close();
+            } finally {
+                errorURL.close();
+            }
+        }
+    }
+}