preparations for image/movie/music indexing

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@280 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent f45dc29f35
commit a1ffc27041

@ -77,7 +77,7 @@ public class IndexControl_p {
prop.put("urlhash", "");
prop.put("result", "");
prop.put("wcount", "" + switchboard.wordIndex.size());
prop.put("ucount", "" + switchboard.loadedURL.size());
prop.put("ucount", "" + switchboard.urlPool.loadedURL.size());
prop.put("otherHosts", "");
prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : "");
@ -132,7 +132,7 @@ public class IndexControl_p {
}
}
if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]);
if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.urlPool.loadedURL.remove(urlx[i]);
switchboard.wordIndex.deleteIndex(keyhash);
post.remove("keyhashdeleteall");
if ((keystring.length() > 0) && (plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)))
@ -143,7 +143,7 @@ public class IndexControl_p {
if (post.containsKey("keyhashdelete")) {
if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]);
if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.urlPool.loadedURL.remove(urlx[i]);
switchboard.wordIndex.removeEntries(keyhash, urlx, true);
// this shall lead to a presentation of the list; so handle that the remaining program
// thinks that it was called for a list presentation
@ -161,14 +161,14 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for url hash " + urlhash + "; nothing deleted.");
} else {
urlstring = htmlFilterContentScraper.urlNormalform(url);
prop.put("urlstring", "");
switchboard.loadedURL.remove(urlhash);
switchboard.urlPool.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring);
}
}
@ -198,7 +198,7 @@ public class IndexControl_p {
String result;
long starttime = System.currentTimeMillis();
indexes[0] = switchboard.wordIndex.getEntity(keyhash, true);
result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.loadedURL);
result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.urlPool.loadedURL);
prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
try {indexes[0].close();} catch (IOException e) {}
}
@ -227,7 +227,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = plasmaURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (MalformedURLException e) {
prop.put("urlstring", "wrong url: " + urlstring);
@ -236,7 +236,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashsearch")) {
plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for url hash " + urlhash);
@ -249,7 +249,7 @@ public class IndexControl_p {
if (post.containsKey("urlhashsimilar")) {
try {
Iterator hashIt = switchboard.loadedURL.urlHashes(urlhash, true);
Iterator hashIt = switchboard.urlPool.loadedURL.urlHashes(urlhash, true);
String result = "Sequential List of URL-Hashes:<br>";
String hash;
int i = 0;
@ -290,7 +290,7 @@ public class IndexControl_p {
// insert constants
prop.put("wcount", "" + switchboard.wordIndex.size());
prop.put("ucount", "" + switchboard.loadedURL.size());
prop.put("ucount", "" + switchboard.urlPool.loadedURL.size());
prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : "");
// return rewrite properties
@ -307,7 +307,7 @@ public class IndexControl_p {
"<tr><td class=\"small\">Description</td><td class=\"tt\">" + entry.descr() + "</td></tr>" +
"<tr><td class=\"small\">Modified-Date</td><td class=\"tt\">" + entry.moddate() + "</td></tr>" +
"<tr><td class=\"small\">Loaded-Date</td><td class=\"tt\">" + entry.loaddate() + "</td></tr>" +
"<tr><td class=\"small\">Referrer</td><td class=\"tt\">" + switchboard.loadedURL.getEntry(entry.referrerHash()).url() + "</td></tr>" +
"<tr><td class=\"small\">Referrer</td><td class=\"tt\">" + switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url() + "</td></tr>" +
"<tr><td class=\"small\">Doctype</td><td class=\"tt\">" + entry.doctype() + "</td></tr>" +
"<tr><td class=\"small\">Copy-Count</td><td class=\"tt\">" + entry.copyCount() + "</td></tr>" +
"<tr><td class=\"small\">Local-Flag</td><td class=\"tt\">" + entry.local() + "</td></tr>" +
@ -351,8 +351,8 @@ public class IndexControl_p {
uh = ie.getUrlHash();
result +=
"<input type=\"checkbox\" name=\"urlhx" + i++ + "\" value=\"" + uh + "\" align=\"top\">";
if (switchboard.loadedURL.exists(uh)) {
us = switchboard.loadedURL.getEntry(uh).url().toString();
if (switchboard.urlPool.loadedURL.exists(uh)) {
us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString();
result +=
"<a href=\"/IndexControl_p.html?" + "keystring=" + keystring +
"&keyhash=" + keyhash + "&urlhash=" + uh + "&urlstringsearch=" + "&urlstring=" + us +

@ -131,8 +131,8 @@ public class IndexCreate_p {
// stack request
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(crawlingStart);
switchboard.loadedURL.remove(urlhash);
switchboard.noticeURL.remove(urlhash);
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.urlPool.noticeURL.remove(urlhash);
// stack url
String reasonString = switchboard.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0,
@ -157,7 +157,7 @@ public class IndexCreate_p {
}
}
if (post.containsKey("clearRejected")) {
switchboard.errorURL.clearStack();
switchboard.urlPool.errorURL.clearStack();
}
if (post.containsKey("moreRejected")) {
showRejectedCount = Integer.parseInt(post.get("showRejected", "10"));
@ -169,17 +169,17 @@ public class IndexCreate_p {
if (post.containsKey("clearcrawlqueue")) {
String urlHash;
int c = 0;
while (switchboard.noticeURL.coreStackSize() > 0) {
urlHash = switchboard.noticeURL.corePop().hash();
if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
while (switchboard.noticeURL.limitStackSize() > 0) {
urlHash = switchboard.noticeURL.limitPop().hash();
if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
while (switchboard.noticeURL.remoteStackSize() > 0) {
urlHash = switchboard.noticeURL.remotePop().hash();
if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; }
// drain the remote-triggered stack: pop from the SAME stack whose size is tested.
// (popping STACK_TYPE_LIMIT here left the remote stack untouched, so its size never dropped)
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) > 0) {
    urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE).hash();
    // only entries with a known hash are removed from the notice db and counted
    if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
prop.put("info", 3);//crawling queue cleared
prop.put("info_numEntries", c);
@ -211,7 +211,7 @@ public class IndexCreate_p {
int queueStackSize = switchboard.queueStack.size();
int loaderThreadsSize = switchboard.cacheLoader.size();
int crawlerListSize = switchboard.noticeURL.stackSize();
int crawlerListSize = switchboard.urlPool.noticeURL.stackSize();
int completequeue = queueStackSize + loaderThreadsSize + crawlerListSize;
if ((completequeue > 0) || ((post != null) && (post.containsKey("refreshpage")))) {
@ -279,11 +279,11 @@ public class IndexCreate_p {
}
// failure cases
if (switchboard.errorURL.stackSize() != 0) {
if (showRejectedCount > switchboard.errorURL.stackSize()) showRejectedCount = switchboard.errorURL.stackSize();
if (switchboard.urlPool.errorURL.stackSize() != 0) {
if (showRejectedCount > switchboard.urlPool.errorURL.stackSize()) showRejectedCount = switchboard.urlPool.errorURL.stackSize();
prop.put("rejected", 1);
prop.put("rejected_num", switchboard.errorURL.stackSize());
if (showRejectedCount != switchboard.errorURL.stackSize()) {
prop.put("rejected_num", switchboard.urlPool.errorURL.stackSize());
if (showRejectedCount != switchboard.urlPool.errorURL.stackSize()) {
prop.put("rejected_only-latest", 1);
prop.put("rejected_only-latest_num", showRejectedCount);
prop.put("rejected_only-latest_newnum", ((int) (showRejectedCount * 1.5)));
@ -295,8 +295,8 @@ public class IndexCreate_p {
plasmaCrawlEURL.entry entry;
yacySeed initiatorSeed, executorSeed;
int j=0;
for (i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) {
entry = (plasmaCrawlEURL.entry) switchboard.errorURL.getStack(i);
for (i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
entry = (plasmaCrawlEURL.entry) switchboard.urlPool.errorURL.getStack(i);
initiatorHash = entry.initiator();
executorHash = entry.executor();
url = entry.url().toString();
@ -380,12 +380,12 @@ public class IndexCreate_p {
prop.put("loader-set_list", i );
}
int localStackSize = switchboard.noticeURL.coreStackSize();
int localStackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
if (localStackSize == 0) {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
plasmaCrawlNURL.entry[] crawlerList = switchboard.noticeURL.coreTop(20);
plasmaCrawlNURL.entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 20);
prop.put("crawler-queue_num", localStackSize);//num Entries
prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent
plasmaCrawlNURL.entry urle;

@ -95,12 +95,12 @@ public class IndexMonitor {
}
// do the commands
if (post.containsKey("clearlist")) switchboard.loadedURL.clearStack(process);
if (post.containsKey("clearlist")) switchboard.urlPool.loadedURL.clearStack(process);
if (post.containsKey("deleteentry")) {
String hash = post.get("hash", null);
if (hash != null) {
// delete from database
switchboard.loadedURL.remove(hash);
switchboard.urlPool.loadedURL.remove(hash);
}
}
if (post.containsKey("moreIndexed")) {
@ -113,7 +113,7 @@ public class IndexMonitor {
if (process == 0) {
prop.put("table", 2);
} else {
prop.putAll(switchboard.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true));
prop.putAll(switchboard.urlPool.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true));
}
prop.put("process", process);
// return rewrite properties

@ -66,7 +66,7 @@ public class IndexShare_p {
prop.put("dtable", "");
prop.put("rtable", "");
prop.put("wcount", "" + switchboard.wordIndex.size());
prop.put("ucount", "" + switchboard.loadedURL.size());
prop.put("ucount", "" + switchboard.urlPool.loadedURL.size());
return prop; // be save
}
@ -79,7 +79,7 @@ public class IndexShare_p {
// insert constants
prop.put("wcount", "" + switchboard.wordIndex.size());
prop.put("ucount", "" + switchboard.loadedURL.size());
prop.put("ucount", "" + switchboard.urlPool.loadedURL.size());
// return rewrite properties
return prop;
}

@ -74,12 +74,12 @@ public class ProxyIndexingMonitor_p {
prop.put("info_message", "");
if (post != null) {
if (post.containsKey("clearlist4")) switchboard.loadedURL.clearStack(4); // local: by proxy crawl
if (post.containsKey("clearlist4")) switchboard.urlPool.loadedURL.clearStack(4); // local: by proxy crawl
if (post.containsKey("deleteentry")) {
String hash = post.get("hash", null);
if (hash != null) {
// delete from database
switchboard.loadedURL.remove(hash);
switchboard.urlPool.loadedURL.remove(hash);
}
}
@ -123,7 +123,7 @@ public class ProxyIndexingMonitor_p {
// create tables
String myname = yacyCore.seedDB.mySeed.getName();
prop.putAll(switchboard.loadedURL.genTableProps(4, showIndexedCount, false, false, "proxy", null, "ProxyIndexingMonitor_p.html", true));
prop.putAll(switchboard.urlPool.loadedURL.genTableProps(4, showIndexedCount, false, false, "proxy", null, "ProxyIndexingMonitor_p.html", true));
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0);

@ -443,7 +443,7 @@ public class dir {
try {
URL url = new URL(urlstring);
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
plasmaCrawlLURL.entry newEntry = switchboard.loadedURL.newEntry(
plasmaCrawlLURL.entry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(),
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/
@ -468,7 +468,7 @@ public class dir {
String urlhash = plasmaURL.urlHash(new URL(urlstring));
Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes());
switchboard.removeReferences(urlhash, words);
switchboard.loadedURL.remove(urlhash);
switchboard.urlPool.loadedURL.remove(urlhash);
} catch (Exception e) {
System.out.println("INTERNAL ERROR in dir.deletePhrase:");
e.printStackTrace();

@ -152,10 +152,10 @@ public class crawlOrder {
reason = reasonString;
delay = "" + (acceptDelay / 4);
// send lurl-Entry as response
plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
if (entry != null) {
response = "double";
switchboard.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString());
delay = "1";
} else {

@ -111,11 +111,11 @@ public class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// put new data into database
switchboard.loadedURL.newEntry(propStr, true, youare, iam, 1);
switchboard.noticeURL.remove(urlhash);
switchboard.urlPool.loadedURL.newEntry(propStr, true, youare, iam, 1);
switchboard.urlPool.noticeURL.remove(urlhash);
// write log
plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
if (entry == null) {
switchboard.getLog().logError("RECEIVED wrong RECEIPT for hash " + urlhash + " from peer " + iam);
} else {
@ -125,10 +125,10 @@ public class crawlReceipt {
// ready for more
prop.put("delay", "10");
} else {
plasmaCrawlNURL.entry en = switchboard.noticeURL.getEntry(urlhash);
plasmaCrawlNURL.entry en = switchboard.urlPool.noticeURL.getEntry(urlhash);
if (en != null) {
switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false);
switchboard.noticeURL.remove(urlhash);
switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false);
switchboard.urlPool.noticeURL.remove(urlhash);
}
prop.put("delay", "100"); // what shall we do with that???
}

@ -110,7 +110,7 @@ public class transferRWI {
switchboard.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
urlHash = entry.getUrlHash();
if ((!(unknownURL.contains(urlHash))) &&
(!(switchboard.loadedURL.exists(urlHash)))) {
(!(switchboard.urlPool.loadedURL.exists(urlHash)))) {
unknownURL.add(urlHash);
}
received++;

@ -71,13 +71,13 @@ public class transferURL {
if (granted) {
int received = 0;
int sizeBefore = switchboard.loadedURL.size();
int sizeBefore = switchboard.urlPool.loadedURL.size();
// read the urls from the other properties and store
String urls;
for (int i = 0; i < urlc; i++) {
urls = (String) post.get("url" + i);
if (urls != null) {
switchboard.loadedURL.newEntry(urls, true, iam, iam, 3);
switchboard.urlPool.loadedURL.newEntry(urls, true, iam, iam, 3);
received++;
}
}
@ -85,7 +85,7 @@ public class transferURL {
yacyCore.seedDB.mySeed.incRU(received);
// return rewrite properties
int more = switchboard.loadedURL.size() - sizeBefore;
int more = switchboard.urlPool.loadedURL.size() - sizeBefore;
doublevalues = "" + (received - more);
switchboard.getLog().logInfo("Received " + received + " URL's from peer " + iam);
if ((received - more) > 0) switchboard.getLog().logError("Received " + doublevalues + " double URL's from peer " + iam);

@ -61,16 +61,22 @@ import de.anomic.tools.bitfield;
public class plasmaCrawlNURL extends plasmaURL {
public static final int STACK_TYPE_NULL = 0; // do not stack
public static final int STACK_TYPE_CORE = 1; // put on local stack
public static final int STACK_TYPE_LIMIT = 2; // put on global stack
public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled
public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack
public static final int STACK_TYPE_NULL = 0; // do not stack
public static final int STACK_TYPE_CORE = 1; // put on local stack
public static final int STACK_TYPE_LIMIT = 2; // put on global stack
public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled
public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack
public static final int STACK_TYPE_IMAGE = 11; // put on image stack
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
public static final int STACK_TYPE_MUSIC = 13; // put on music stack
private kelondroStack coreStack; // links found by crawling to depth-1
private kelondroStack limitStack; // links found by crawling at target depth
private kelondroStack overhangStack; // links found by crawling at depth+1
private kelondroStack remoteStack; // links from remote crawl orders
private kelondroStack imageStack; // links pointing to image resources
private kelondroStack movieStack; // links pointing to movie resources
private kelondroStack musicStack; // links pointing to music resources
private HashSet stackIndex; // to find out if a specific link is already on any stack
@ -104,39 +110,41 @@ public class plasmaCrawlNURL extends plasmaURL {
urlHashCache = new kelondroTree(cacheFile, bufferkb * 0x400, ce);
}
File localCrawlStack = new File(cacheStacksPath, "urlNoticeLocal0.stack");
if (localCrawlStack.exists()) {
coreStack = new kelondroStack(localCrawlStack, 0);
} else {
coreStack = new kelondroStack(localCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
}
File limitCrawlStack = new File(cacheStacksPath, "urlNoticeLimit0.stack");
if (limitCrawlStack.exists()) {
limitStack = new kelondroStack(limitCrawlStack, 0);
} else {
limitStack = new kelondroStack(limitCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
}
File overhangCrawlStack = new File(cacheStacksPath, "urlNoticeOverhang0.stack");
if (overhangCrawlStack.exists()) {
overhangStack = new kelondroStack(overhangCrawlStack, 0);
} else {
overhangStack = new kelondroStack(overhangCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
}
File globalCrawlStack = new File(cacheStacksPath, "urlNoticeRemote0.stack");
if (globalCrawlStack.exists()) {
remoteStack = new kelondroStack(globalCrawlStack, 0);
} else {
remoteStack = new kelondroStack(globalCrawlStack, 0, new int[] {plasmaURL.urlHashLength});
}
File coreStackFile = new File(cacheStacksPath, "urlNoticeLocal0.stack");
File limitStackFile = new File(cacheStacksPath, "urlNoticeLimit0.stack");
File overhangStackFile = new File(cacheStacksPath, "urlNoticeOverhang0.stack");
File remoteStackFile = new File(cacheStacksPath, "urlNoticeRemote0.stack");
File imageStackFile = new File(cacheStacksPath, "urlNoticeImage0.stack");
File movieStackFile = new File(cacheStacksPath, "urlNoticeMovie0.stack");
File musicStackFile = new File(cacheStacksPath, "urlNoticeMusic0.stack");
if (coreStackFile.exists()) coreStack = new kelondroStack(coreStackFile, 0); else coreStack = new kelondroStack(coreStackFile, 0, new int[] {plasmaURL.urlHashLength});
if (limitStackFile.exists()) limitStack = new kelondroStack(limitStackFile, 0); else limitStack = new kelondroStack(limitStackFile, 0, new int[] {plasmaURL.urlHashLength});
if (overhangStackFile.exists()) overhangStack = new kelondroStack(overhangStackFile, 0); else overhangStack = new kelondroStack(overhangStackFile, 0, new int[] {plasmaURL.urlHashLength});
if (remoteStackFile.exists()) remoteStack = new kelondroStack(remoteStackFile, 0); else remoteStack = new kelondroStack(remoteStackFile, 0, new int[] {plasmaURL.urlHashLength});
if (imageStackFile.exists()) imageStack = new kelondroStack(imageStackFile, 0); else imageStack = new kelondroStack(imageStackFile, 0, new int[] {plasmaURL.urlHashLength});
if (movieStackFile.exists()) movieStack = new kelondroStack(movieStackFile, 0); else movieStack = new kelondroStack(movieStackFile, 0, new int[] {plasmaURL.urlHashLength});
if (musicStackFile.exists()) musicStack = new kelondroStack(musicStackFile, 0); else musicStack = new kelondroStack(musicStackFile, 0, new int[] {plasmaURL.urlHashLength});
// init stack Index
stackIndex = new HashSet();
Iterator i = coreStack.iterator();
while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
i = remoteStack.iterator();
while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
new initStackIndex().start();
}
/**
 * Thread that copies the key of every node found on the seven notice
 * stacks into stackIndex. The stacks are visited in the order
 * core, limit, overhang, remote, image, movie, music.
 */
public class initStackIndex extends Thread {
    public void run() {
        final kelondroStack[] stacks = new kelondroStack[] {
            coreStack, limitStack, overhangStack, remoteStack,
            imageStack, movieStack, musicStack
        };
        try {
            for (int s = 0; s < stacks.length; s++) {
                Iterator nodes = stacks[s].iterator();
                while (nodes.hasNext()) {
                    kelondroRecords.Node node = (kelondroRecords.Node) nodes.next();
                    stackIndex.add(new String(node.getKey()));
                }
            }
        // an IOException ends the whole scan; stacks not yet visited stay un-indexed
        // NOTE(review): the exception is dropped silently -- consider reporting it
        } catch (IOException e) {}
    }
}
private static String normalizeHost(String host) {
if (host.length() > urlHostLength) host = host.substring(0, urlHostLength);
host = host.toLowerCase();
@ -155,20 +163,17 @@ public class plasmaCrawlNURL extends plasmaURL {
return coreStack.size() + limitStack.size() + remoteStack.size();
}
public int coreStackSize() {
return coreStack.size();
}
public int limitStackSize() {
return limitStack.size();
}
public int overhangStackSize() {
return overhangStack.size();
}
public int remoteStackSize() {
return remoteStack.size();
/**
 * Reports the size of one notice stack.
 *
 * @param stackType one of the STACK_TYPE_* constants
 * @return the size of the selected stack, or -1 if stackType selects no stack
 */
public int stackSize(int stackType) {
    final kelondroStack selected;
    switch (stackType) {
        case STACK_TYPE_CORE:     selected = coreStack;     break;
        case STACK_TYPE_LIMIT:    selected = limitStack;    break;
        case STACK_TYPE_OVERHANG: selected = overhangStack; break;
        case STACK_TYPE_REMOTE:   selected = remoteStack;   break;
        case STACK_TYPE_IMAGE:    selected = imageStack;    break;
        case STACK_TYPE_MOVIE:    selected = movieStack;    break;
        case STACK_TYPE_MUSIC:    selected = musicStack;    break;
        default:
            // STACK_TYPE_NULL and unknown values have no stack behind them
            return -1;
    }
    return selected.size();
}
public boolean existsInStack(String urlhash) {
@ -179,35 +184,48 @@ public class plasmaCrawlNURL extends plasmaURL {
String profile, int depth, int anchors, int forkfactor, int stackMode) {
entry e = new entry(initiator, url, referrer, name, loaddate, profile,
depth, anchors, forkfactor);
// stackMode can have 3 cases:
// 0 = do not stack
// 1 = on local stack
// 2 = on global stack
// 3 = on overhang stack
// 4 = on remote stack
try {
if (stackMode == 1) coreStack.push(new byte[][] {e.hash.getBytes()});
if (stackMode == 2) limitStack.push(new byte[][] {e.hash.getBytes()});
if (stackMode == 3) overhangStack.push(new byte[][] {e.hash.getBytes()});
if (stackMode == 4) remoteStack.push(new byte[][] {e.hash.getBytes()});
switch (stackMode) {
case STACK_TYPE_CORE: coreStack.push(new byte[][] {e.hash.getBytes()}); break;
case STACK_TYPE_LIMIT: limitStack.push(new byte[][] {e.hash.getBytes()}); break;
case STACK_TYPE_OVERHANG: overhangStack.push(new byte[][] {e.hash.getBytes()}); break;
case STACK_TYPE_REMOTE: remoteStack.push(new byte[][] {e.hash.getBytes()}); break;
case STACK_TYPE_IMAGE: imageStack.push(new byte[][] {e.hash.getBytes()}); break;
case STACK_TYPE_MOVIE: movieStack.push(new byte[][] {e.hash.getBytes()}); break;
case STACK_TYPE_MUSIC: musicStack.push(new byte[][] {e.hash.getBytes()}); break;
default: break;
}
stackIndex.add(new String(e.hash.getBytes()));
} catch (IOException er) {
}
return e;
}
public entry corePop() { return pop(coreStack); }
public entry[] coreTop(int count) { return top(coreStack, count); }
public entry limitPop() { return pop(limitStack); }
public entry[] limitTop(int count) { return top(limitStack, count); }
public entry overhangPop() { return pop(overhangStack); }
public entry[] overhangTop(int count) { return top(overhangStack, count); }
/**
 * Dispatches to the stack-specific top(stack, count) for the chosen stack.
 *
 * @param stackType one of the STACK_TYPE_* constants
 * @param count     passed through unchanged to top(stack, count)
 * @return whatever top(stack, count) returns, or null if stackType selects no stack
 */
public entry[] top(int stackType, int count) {
    final kelondroStack source;
    switch (stackType) {
        case STACK_TYPE_CORE:     source = coreStack;     break;
        case STACK_TYPE_LIMIT:    source = limitStack;    break;
        case STACK_TYPE_OVERHANG: source = overhangStack; break;
        case STACK_TYPE_REMOTE:   source = remoteStack;   break;
        case STACK_TYPE_IMAGE:    source = imageStack;    break;
        case STACK_TYPE_MOVIE:    source = movieStack;    break;
        case STACK_TYPE_MUSIC:    source = musicStack;    break;
        default:
            return null;
    }
    return top(source, count);
}
public entry remotePop() { return pop(remoteStack); }
public entry[] remoteTop(int count) { return top(remoteStack, count); }
/**
 * Dispatches to the stack-specific pop(stack) for the chosen stack.
 *
 * @param stackType one of the STACK_TYPE_* constants
 * @return whatever pop(stack) returns, or null if stackType selects no stack
 */
public entry pop(int stackType) {
    final kelondroStack source;
    switch (stackType) {
        case STACK_TYPE_CORE:     source = coreStack;     break;
        case STACK_TYPE_LIMIT:    source = limitStack;    break;
        case STACK_TYPE_OVERHANG: source = overhangStack; break;
        case STACK_TYPE_REMOTE:   source = remoteStack;   break;
        case STACK_TYPE_IMAGE:    source = imageStack;    break;
        case STACK_TYPE_MOVIE:    source = movieStack;    break;
        case STACK_TYPE_MUSIC:    source = musicStack;    break;
        default:
            return null;
    }
    return pop(source);
}
private entry pop(kelondroStack stack) {
// this is a filo - pop
@ -237,7 +255,7 @@ public class plasmaCrawlNURL extends plasmaURL {
return null;
}
}
// builds an entry object for the given url hash via the entry(hash) constructor;
// always returns a fresh object, never null.
// NOTE(review): what the constructor does when the hash is unknown is not visible here -- confirm
public synchronized entry getEntry(String hash) {
return new entry(hash);
}
@ -247,10 +265,11 @@ public class plasmaCrawlNURL extends plasmaURL {
urlHashCache.remove(hash.getBytes());
} catch (IOException e) {}
}
public class entry {
private String initiator; // the initiator hash, is NULL or "" if it is the own proxy
private String initiator; // the initiator hash, is NULL or "" if it is the own proxy;
// if this is generated by a crawl, the own peer hash in entered
private String hash; // the url's hash
private String referrer; // the url's referrer hash
private URL url; // the url as string

@ -158,9 +158,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private File cachePath;
private File plasmaPath;
public File listsPath;
public plasmaCrawlLURL loadedURL;
public plasmaCrawlNURL noticeURL;
public plasmaCrawlEURL errorURL;
public plasmaURLPool urlPool;
public plasmaWordIndex wordIndex;
public plasmaSearch searchManager;
public plasmaHTCache cacheManager;
@ -248,13 +246,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start indexing management
log.logSystem("Starting Indexing Management");
loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL);
errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL);
urlPool = new plasmaURLPool(plasmaPath, ramLURL, ramNURL, ramEURL);
wordIndex = new plasmaWordIndex(plasmaPath, ramRWI, log);
int wordCacheMax = Integer.parseInt((String) getConfig("wordCacheMax", "10000"));
wordIndex.setMaxWords(wordCacheMax);
searchManager = new plasmaSearch(loadedURL, wordIndex);
searchManager = new plasmaSearch(urlPool.loadedURL, wordIndex);
// start a cache manager
log.logSystem("Starting HT Cache Manager");
@ -402,7 +400,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} catch (IOException e) {}
}
private void cleanProfiles() {
if ((queueStack.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return;
if ((queueStack.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return;
Iterator i = profiles.profiles(true);
plasmaCrawlProfile.entry entry;
try {
@ -446,9 +444,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wikiDB.close();
messageDB.close();
facilityDB.close();
loadedURL.close();
noticeURL.close();
errorURL.close();
urlPool.close();
profiles.close();
parser.close();
cacheManager.close();
@ -468,7 +464,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public int lUrlSize() {
return loadedURL.size();
return urlPool.loadedURL.size();
}
public int cacheSizeMin() {
@ -496,10 +492,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do one processing step
log.logDebug("DEQUEUE: cacheManager=" + ((cacheManager.idle()) ? "idle" : "busy") +
", queueStack=" + queueStack.size() +
", coreStackSize=" + noticeURL.coreStackSize() +
", limitStackSize=" + noticeURL.limitStackSize() +
", overhangStackSize=" + noticeURL.overhangStackSize() +
", remoteStackSize=" + noticeURL.remoteStackSize());
", coreStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
", limitStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) +
", overhangStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) +
", remoteStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
nextentry = (plasmaHTCache.Entry) queueStack.removeFirst();
}
processResourceStack(nextentry);
@ -508,9 +504,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public int cleanupJobSize() {
int c = 0;
if ((errorURL.stackSize() > 1000)) c++;
if ((urlPool.errorURL.stackSize() > 1000)) c++;
for (int i = 1; i <= 6; i++) {
if (loadedURL.getStackSize(i) > 1000) c++;
if (urlPool.loadedURL.getStackSize(i) > 1000) c++;
}
return c;
}
@ -520,14 +516,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
boolean hasDoneSomething = false;
// clean up error stack
if ((errorURL.stackSize() > 1000)) {
errorURL.clearStack();
if ((urlPool.errorURL.stackSize() > 1000)) {
urlPool.errorURL.clearStack();
hasDoneSomething = true;
}
// clean up loadedURL stack
for (int i = 1; i <= 6; i++) {
if (loadedURL.getStackSize(i) > 1000) {
loadedURL.clearStack(i);
if (urlPool.loadedURL.getStackSize(i) > 1000) {
urlPool.loadedURL.clearStack(i);
hasDoneSomething = true;
}
}
@ -567,11 +563,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// returns the current size of the core stack of the notice-URL store
public int coreCrawlJobSize() {
return noticeURL.coreStackSize();
return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
}
public boolean coreCrawlJob() {
if (noticeURL.coreStackSize() == 0) {
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
}
@ -600,13 +596,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// do a local crawl
plasmaCrawlNURL.entry urlEntry = noticeURL.corePop();
plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
if (urlEntry.url() == null) return false;
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
log.logError("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
@ -617,11 +613,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// returns the current size of the limit stack of the notice-URL store
public int limitCrawlTriggerJobSize() {
return noticeURL.limitStackSize();
return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
}
public boolean limitCrawlTriggerJob() {
if (noticeURL.limitStackSize() == 0) {
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) {
//log.logDebug("LimitCrawl: queue is empty");
return false;
}
@ -639,13 +635,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// start a global crawl, if possible
plasmaCrawlNURL.entry urlEntry = noticeURL.limitPop();
plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (urlEntry.url() == null) return true;
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("REMOTECRAWLTRIGGER[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
log.logError("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
@ -653,7 +649,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
boolean tryRemote =
((noticeURL.coreStackSize() != 0) || (queueStack.size() != 0)) /* should do ourself */ &&
((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (queueStack.size() != 0)) /* should do ourself */ &&
(profile.remoteIndexing()) /* granted */ &&
(urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) /* not proxy */ &&
((yacyCore.seedDB.mySeed.isSenior()) ||
@ -681,7 +677,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// returns the current size of the remote stack of the notice-URL store
public int remoteTriggeredCrawlJobSize() {
return noticeURL.remoteStackSize();
return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
}
public boolean remoteTriggeredCrawlJob() {
@ -689,7 +685,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do nothing if either there are private processes to be done
// or there is no global crawl on the stack
if (noticeURL.remoteStackSize() == 0) {
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
//log.logDebug("GlobalCrawl: queue is empty");
return false;
}
@ -720,13 +716,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
plasmaCrawlNURL.entry urlEntry = noticeURL.remotePop();
plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
if (urlEntry.url() == null) return false;
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("REMOTETRIGGEREDCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
log.logError("REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
@ -807,12 +803,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (rejectReason == null) {
c++;
} else {
errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
urlPool.errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +
", NEW CRAWL STACK SIZE IS " + noticeURL.coreStackSize());
", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
}
// create index
@ -835,7 +831,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//log.logInfo("INDEXING HEADLINE:" + descr);
try {
log.logDebug("(Profile) Create LURL-Entry for '" + entry.nomalizedURLString + "'");
plasmaCrawlLURL.entry newEntry = loadedURL.newEntry(
plasmaCrawlLURL.entry newEntry = urlPool.loadedURL.newEntry(
entry.url, descr, entry.lastModified, new Date(),
initiatorHash,
yacyCore.seedDB.mySeed.hash,
@ -850,7 +846,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String urlHash = newEntry.hash();
log.logDebug("(Profile) Remove NURL for '" + entry.nomalizedURLString + "'");
noticeURL.remove(urlHash); // worked-off
urlPool.noticeURL.remove(urlHash); // worked-off
if (((processCase == 4) || (processCase == 5) || (processCase == 6)) &&
(entry.profile.localIndexing())) {
@ -882,7 +878,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
log.logInfo("Not indexed any word in URL " + entry.url + "; cause: " + noIndexReason);
errorURL.newEntry(entry.url, referrerHash,
urlPool.errorURL.newEntry(entry.url, referrerHash,
((entry.proxy()) ? plasmaURL.dummyHash : entry.initiator()),
yacyCore.seedDB.mySeed.hash,
descr, noIndexReason, new bitfield(plasmaURL.urlFlagLength), true);
@ -925,7 +921,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// filter deny
if ((currentdepth > 0) && (!(nexturlString.matches(profile.generalFilter())))) {
reason = "denied_(does_not_match_filter)";
errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
@ -933,7 +929,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// deny cgi
if (plasmaHTCache.isCGI(nexturlString)) {
reason = "denied_(cgi_url)";
errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
@ -941,22 +937,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// deny post properties
if ((plasmaHTCache.isPOST(nexturlString)) && (!(profile.crawlingQ()))) {
reason = "denied_(post_url)";
errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
String nexturlhash = plasmaURL.urlHash(nexturl);
if (loadedURL.exists(nexturlhash)) {
String dbocc = "";
if ((dbocc = urlPool.testHash(nexturlhash)) != null) {
// DISTIGUISH OLD/RE-SEARCH CASES HERE!
reason = "double_(already_loaded)";
errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
if (noticeURL.existsInStack(nexturlhash)) {
reason = "double_(noticed_in_crawler)";
errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
reason = "double_(registered_in_" + dbocc + ")";
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
return reason;
}
@ -970,7 +961,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
((yacyCore.seedDB.mySeed.isSenior()) ||
(yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */;
noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */
loadDate, /* load date */
referrerHash, /* last url in crawling queue */
@ -988,11 +979,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// resolves a url hash to its URL by asking the url stores one after another:
// first the notice-URL store, then the loaded-URL store, then the error-URL store.
// The url() of the first entry found is returned.
// Returns null for the dummy hash and if no store has an entry for the hash.
private URL hash2url(String urlhash) {
// the dummy hash stands for "no url"; do not look it up
if (urlhash.equals(plasmaURL.dummyHash)) return null;
plasmaCrawlNURL.entry ne = noticeURL.getEntry(urlhash);
plasmaCrawlNURL.entry ne = urlPool.noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
plasmaCrawlLURL.entry le = loadedURL.getEntry(urlhash);
plasmaCrawlLURL.entry le = urlPool.loadedURL.getEntry(urlhash);
if (le != null) return le.url();
plasmaCrawlEURL.entry ee = errorURL.getEntry(urlhash);
plasmaCrawlEURL.entry ee = urlPool.errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;
}
@ -1005,17 +996,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// hands one crawl stack entry over to the cache loader.
// urlEntry: the crawl stack entry to work off; may be null
// profile:  the crawl profile that is passed through to the loader
// returns false if there is no entry or the entry carries no url,
// true after the url was enqueued for loading
private boolean processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile) {
// work off one Crawl stack entry
// each condition alone makes the entry unusable, so they must be combined with '||';
// the short-circuit also guarantees that url() is never called on a null entry
if ((urlEntry == null) || (urlEntry.url() == null)) {
log.logInfo("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.limitStackSize() + ", " + noticeURL.overhangStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null");
log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return false;
}
// the loader is given everything it needs from the entry: url, referrer, initiator and depth
cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.limitStackSize() + ", " + noticeURL.overhangStackSize() + ", " + noticeURL.remoteStackSize() + "]: enqueued for load " + urlEntry.url());
log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: enqueued for load " + urlEntry.url());
return true;
}
private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.entry urlEntry) {
if (urlEntry == null) {
log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null");
log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return false;
}
@ -1077,8 +1068,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.entry entry = loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
noticeURL.remove(entry.hash());
plasmaCrawlLURL.entry entry = urlPool.loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
urlPool.noticeURL.remove(entry.hash());
log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
@ -1173,7 +1164,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int fetchpeers = ((int) time / 1000) * 3; // number of target peers; means 30 peers in 10 seconds
long fetchtime = time * 7 / 10; // time to waste
if (fetchcount > count) fetchcount = count;
globalresults = yacySearch.searchHashes(queryhashes, loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime);
globalresults = yacySearch.searchHashes(queryhashes, urlPool.loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime);
log.logDebug("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
}
prop.put("globalresults", globalresults); // the result are written to the local DB
@ -1217,7 +1208,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes()));
loadedURL.remove(urlentry.hash()); // clean up
urlPool.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
url = new URL("http://" + address + "/" + host.substring(0, p) + filename);
@ -1367,7 +1358,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (actionName.equals("urlcount")) {
serverObjects result = new serverObjects();
result.put("urls","" + loadedURL.size());
result.put("urls","" + urlPool.loadedURL.size());
return result;
}
@ -1392,7 +1383,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
plasmaCrawlLURL.entry entry = loadedURL.getEntry(urlhash);
plasmaCrawlLURL.entry entry = urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) return 0;
// get set of words
@ -1401,7 +1392,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// delete all word references
int count = removeReferences(urlhash, words);
// finally delete the url entry itself
loadedURL.remove(urlhash);
urlPool.loadedURL.remove(urlhash);
return count;
}
@ -1443,7 +1434,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((yacyCore.seedDB == null) ||
(yacyCore.seedDB.mySeed == null) ||
(yacyCore.seedDB.mySeed.isVirgin()) ||
(loadedURL.size() < 10) ||
(urlPool.loadedURL.size() < 10) ||
(wordIndex.size() < 100) ||
(!(yacyCore.seedDB.mySeed.isJunior()))) return false;
@ -1453,7 +1444,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (
(queueStack.size() == 0) &&
(cacheLoader.size() == 0) &&
(noticeURL.stackSize() == 0) &&
(urlPool.noticeURL.stackSize() == 0) &&
(getConfig("allowDistributeIndex", "false").equals("true")) &&
((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) {
indexCount = transferred;
@ -1508,7 +1499,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while ((e.hasMoreElements()) && (hc < peerCount)) {
seed = (yacySeed) e.nextElement();
if (seed != null) {
error = yacyClient.transferIndex(seed, indexEntities, loadedURL);
error = yacyClient.transferIndex(seed, indexEntities, urlPool.loadedURL);
if (error == null) {
log.logInfo("Index Transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull");
peerNames += ", " + seed.getName();

@ -0,0 +1,77 @@
// plasmaURLPool.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 16.06.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// this class combines all url storage methods into one. It is the host for all url storage
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
public class plasmaURLPool {

    // the three url stores hosted by this pool; they are public because
    // callers access them directly (e.g. urlPool.loadedURL.size())
    public plasmaCrawlLURL loadedURL;
    public plasmaCrawlNURL noticeURL;
    public plasmaCrawlEURL errorURL;

    // opens all three url stores below plasmaPath:
    // - loadedURL uses the file "urlHash.db" inside plasmaPath
    // - noticeURL is given plasmaPath itself
    // - errorURL uses the file "urlErr0.db" inside plasmaPath
    // ramLURL, ramNURL and ramEURL are passed on unchanged to the respective store;
    // presumably they are RAM cache sizes -- TODO confirm against the store constructors
    public plasmaURLPool(File plasmaPath, int ramLURL, int ramNURL, int ramEURL) throws IOException {
        loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
        noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL);
        errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL);
    }

    public String testHash(String hash) {
        // tests if hash occurs in the loaded-URL store or in the crawler stack
        // of the notice-URL store (the error-URL store is not consulted).
        // if it exists, the name of the database is returned ("loaded" or "crawler"),
        // if it does not exist, null is returned
        if (loadedURL.exists(hash)) return "loaded";
        if (noticeURL.existsInStack(hash)) return "crawler";
        return null;
    }

    public void close() throws IOException {
        // close all three stores, even if closing one of them fails:
        // the nested finally blocks make sure that a failure in an earlier close()
        // does not leave the later stores open. If more than one close() fails,
        // the failure that happened last is the one that reaches the caller.
        try {
            loadedURL.close();
        } finally {
            try {
                noticeURL.close();
            } finally {
                errorURL.close();
            }
        }
    }
}
Loading…
Cancel
Save