From a1ffc2704157cfbab232a4ce182ba3ea6f95402d Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 16 Jun 2005 00:31:13 +0000 Subject: [PATCH] preparations for image/movie/music indexing git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@280 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 26 +-- htroot/IndexCreate_p.java | 42 ++--- htroot/IndexMonitor.java | 6 +- htroot/IndexShare_p.java | 4 +- htroot/ProxyIndexingMonitor_p.java | 6 +- htroot/htdocsdefault/dir.java | 4 +- htroot/yacy/crawlOrder.java | 4 +- htroot/yacy/crawlReceipt.java | 12 +- htroot/yacy/transferRWI.java | 2 +- htroot/yacy/transferURL.java | 6 +- source/de/anomic/plasma/plasmaCrawlNURL.java | 163 ++++++++++-------- .../de/anomic/plasma/plasmaSwitchboard.java | 129 +++++++------- source/de/anomic/plasma/plasmaURLPool.java | 77 +++++++++ 13 files changed, 284 insertions(+), 197 deletions(-) create mode 100644 source/de/anomic/plasma/plasmaURLPool.java diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 2c5d10b58..72cdd622b 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -77,7 +77,7 @@ public class IndexControl_p { prop.put("urlhash", ""); prop.put("result", ""); prop.put("wcount", "" + switchboard.wordIndex.size()); - prop.put("ucount", "" + switchboard.loadedURL.size()); + prop.put("ucount", "" + switchboard.urlPool.loadedURL.size()); prop.put("otherHosts", ""); prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : ""); prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? 
"checked" : ""); @@ -132,7 +132,7 @@ public class IndexControl_p { } } if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true); - if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]); + if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.urlPool.loadedURL.remove(urlx[i]); switchboard.wordIndex.deleteIndex(keyhash); post.remove("keyhashdeleteall"); if ((keystring.length() > 0) && (plasmaWordIndexEntry.word2hash(keystring).equals(keyhash))) @@ -143,7 +143,7 @@ public class IndexControl_p { if (post.containsKey("keyhashdelete")) { if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true); - if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]); + if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.urlPool.loadedURL.remove(urlx[i]); switchboard.wordIndex.removeEntries(keyhash, urlx, true); // this shall lead to a presentation of the list; so handle that the remaining program // thinks that it was called for a list presentation @@ -161,14 +161,14 @@ public class IndexControl_p { } if (post.containsKey("urlhashdelete")) { - plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); URL url = entry.url(); if (url == null) { prop.put("result", "No Entry for url hash " + urlhash + "; nothing deleted."); } else { urlstring = htmlFilterContentScraper.urlNormalform(url); prop.put("urlstring", ""); - switchboard.loadedURL.remove(urlhash); + switchboard.urlPool.loadedURL.remove(urlhash); prop.put("result", "Removed URL " + urlstring); } } @@ -198,7 +198,7 @@ public class IndexControl_p { String result; long starttime = System.currentTimeMillis(); indexes[0] = switchboard.wordIndex.getEntity(keyhash, true); - result = 
yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.loadedURL); + result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.urlPool.loadedURL); prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result); try {indexes[0].close();} catch (IOException e) {} } @@ -227,7 +227,7 @@ public class IndexControl_p { URL url = new URL(urlstring); urlhash = plasmaURL.urlHash(url); prop.put("urlhash", urlhash); - plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); prop.put("result", genUrlProfile(switchboard, entry, urlhash)); } catch (MalformedURLException e) { prop.put("urlstring", "wrong url: " + urlstring); @@ -236,7 +236,7 @@ public class IndexControl_p { } if (post.containsKey("urlhashsearch")) { - plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); URL url = entry.url(); if (url == null) { prop.put("result", "No Entry for url hash " + urlhash); @@ -249,7 +249,7 @@ public class IndexControl_p { if (post.containsKey("urlhashsimilar")) { try { - Iterator hashIt = switchboard.loadedURL.urlHashes(urlhash, true); + Iterator hashIt = switchboard.urlPool.loadedURL.urlHashes(urlhash, true); String result = "Sequential List of URL-Hashes:
"; String hash; int i = 0; @@ -290,7 +290,7 @@ public class IndexControl_p { // insert constants prop.put("wcount", "" + switchboard.wordIndex.size()); - prop.put("ucount", "" + switchboard.loadedURL.size()); + prop.put("ucount", "" + switchboard.urlPool.loadedURL.size()); prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : ""); prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : ""); // return rewrite properties @@ -307,7 +307,7 @@ public class IndexControl_p { "Description" + entry.descr() + "" + "Modified-Date" + entry.moddate() + "" + "Loaded-Date" + entry.loaddate() + "" + - "Referrer" + switchboard.loadedURL.getEntry(entry.referrerHash()).url() + "" + + "Referrer" + switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url() + "" + "Doctype" + entry.doctype() + "" + "Copy-Count" + entry.copyCount() + "" + "Local-Flag" + entry.local() + "" + @@ -351,8 +351,8 @@ public class IndexControl_p { uh = ie.getUrlHash(); result += ""; - if (switchboard.loadedURL.exists(uh)) { - us = switchboard.loadedURL.getEntry(uh).url().toString(); + if (switchboard.urlPool.loadedURL.exists(uh)) { + us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString(); result += " 0) { - urlHash = switchboard.noticeURL.corePop().hash(); - if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; } + while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) { + urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE).hash(); + if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } } - while (switchboard.noticeURL.limitStackSize() > 0) { - urlHash = switchboard.noticeURL.limitPop().hash(); - if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; } + while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) { + urlHash = 
switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash(); + if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } } - while (switchboard.noticeURL.remoteStackSize() > 0) { - urlHash = switchboard.noticeURL.remotePop().hash(); - if (urlHash != null) { switchboard.noticeURL.remove(urlHash); c++; } + while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) > 0) { + urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE).hash(); + if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } } prop.put("info", 3);//crawling queue cleared prop.put("info_numEntries", c); @@ -211,7 +211,7 @@ public class IndexCreate_p { int queueStackSize = switchboard.queueStack.size(); int loaderThreadsSize = switchboard.cacheLoader.size(); - int crawlerListSize = switchboard.noticeURL.stackSize(); + int crawlerListSize = switchboard.urlPool.noticeURL.stackSize(); int completequeue = queueStackSize + loaderThreadsSize + crawlerListSize; if ((completequeue > 0) || ((post != null) && (post.containsKey("refreshpage")))) { @@ -279,11 +279,11 @@ public class IndexCreate_p { } // failure cases - if (switchboard.errorURL.stackSize() != 0) { - if (showRejectedCount > switchboard.errorURL.stackSize()) showRejectedCount = switchboard.errorURL.stackSize(); + if (switchboard.urlPool.errorURL.stackSize() != 0) { + if (showRejectedCount > switchboard.urlPool.errorURL.stackSize()) showRejectedCount = switchboard.urlPool.errorURL.stackSize(); prop.put("rejected", 1); - prop.put("rejected_num", switchboard.errorURL.stackSize()); - if (showRejectedCount != switchboard.errorURL.stackSize()) { + prop.put("rejected_num", switchboard.urlPool.errorURL.stackSize()); + if (showRejectedCount != switchboard.urlPool.errorURL.stackSize()) { prop.put("rejected_only-latest", 1); prop.put("rejected_only-latest_num", showRejectedCount); prop.put("rejected_only-latest_newnum", ((int) (showRejectedCount * 1.5))); @@
-295,8 +295,8 @@ public class IndexCreate_p { plasmaCrawlEURL.entry entry; yacySeed initiatorSeed, executorSeed; int j=0; - for (i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) { - entry = (plasmaCrawlEURL.entry) switchboard.errorURL.getStack(i); + for (i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) { + entry = (plasmaCrawlEURL.entry) switchboard.urlPool.errorURL.getStack(i); initiatorHash = entry.initiator(); executorHash = entry.executor(); url = entry.url().toString(); @@ -380,12 +380,12 @@ public class IndexCreate_p { prop.put("loader-set_list", i ); } - int localStackSize = switchboard.noticeURL.coreStackSize(); + int localStackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); if (localStackSize == 0) { prop.put("crawler-queue", 0); } else { prop.put("crawler-queue", 1); - plasmaCrawlNURL.entry[] crawlerList = switchboard.noticeURL.coreTop(20); + plasmaCrawlNURL.entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 20); prop.put("crawler-queue_num", localStackSize);//num Entries prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent plasmaCrawlNURL.entry urle; diff --git a/htroot/IndexMonitor.java b/htroot/IndexMonitor.java index a6ff070ee..5793e1fd0 100644 --- a/htroot/IndexMonitor.java +++ b/htroot/IndexMonitor.java @@ -95,12 +95,12 @@ public class IndexMonitor { } // do the commands - if (post.containsKey("clearlist")) switchboard.loadedURL.clearStack(process); + if (post.containsKey("clearlist")) switchboard.urlPool.loadedURL.clearStack(process); if (post.containsKey("deleteentry")) { String hash = post.get("hash", null); if (hash != null) { // delete from database - switchboard.loadedURL.remove(hash); + switchboard.urlPool.loadedURL.remove(hash); } } if (post.containsKey("moreIndexed")) { @@ -113,7 +113,7 @@ public class 
IndexMonitor { if (process == 0) { prop.put("table", 2); } else { - prop.putAll(switchboard.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true)); + prop.putAll(switchboard.urlPool.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true)); } prop.put("process", process); // return rewrite properties diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java index 4315153cc..69321499f 100644 --- a/htroot/IndexShare_p.java +++ b/htroot/IndexShare_p.java @@ -66,7 +66,7 @@ public class IndexShare_p { prop.put("dtable", ""); prop.put("rtable", ""); prop.put("wcount", "" + switchboard.wordIndex.size()); - prop.put("ucount", "" + switchboard.loadedURL.size()); + prop.put("ucount", "" + switchboard.urlPool.loadedURL.size()); return prop; // be save } @@ -79,7 +79,7 @@ public class IndexShare_p { // insert constants prop.put("wcount", "" + switchboard.wordIndex.size()); - prop.put("ucount", "" + switchboard.loadedURL.size()); + prop.put("ucount", "" + switchboard.urlPool.loadedURL.size()); // return rewrite properties return prop; } diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java index 1dd018b8f..fb9b7fcca 100644 --- a/htroot/ProxyIndexingMonitor_p.java +++ b/htroot/ProxyIndexingMonitor_p.java @@ -74,12 +74,12 @@ public class ProxyIndexingMonitor_p { prop.put("info_message", ""); if (post != null) { - if (post.containsKey("clearlist4")) switchboard.loadedURL.clearStack(4); // local: by proxy crawl + if (post.containsKey("clearlist4")) switchboard.urlPool.loadedURL.clearStack(4); // local: by proxy crawl if (post.containsKey("deleteentry")) { String hash = post.get("hash", null); if (hash != null) { // delete from database - switchboard.loadedURL.remove(hash); + switchboard.urlPool.loadedURL.remove(hash); } } @@ -123,7 +123,7 @@ public class ProxyIndexingMonitor_p { // create tables String myname = yacyCore.seedDB.mySeed.getName(); - 
prop.putAll(switchboard.loadedURL.genTableProps(4, showIndexedCount, false, false, "proxy", null, "ProxyIndexingMonitor_p.html", true)); + prop.putAll(switchboard.urlPool.loadedURL.genTableProps(4, showIndexedCount, false, false, "proxy", null, "ProxyIndexingMonitor_p.html", true)); prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0); diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index 936e85781..5586ba4e7 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -443,7 +443,7 @@ public class dir { try { URL url = new URL(urlstring); plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); - plasmaCrawlLURL.entry newEntry = switchboard.loadedURL.newEntry( + plasmaCrawlLURL.entry newEntry = switchboard.urlPool.loadedURL.newEntry( url, "YaCyShare: " + descr, new Date(), new Date(), "____________", /*initiator*/ yacyCore.seedDB.mySeed.hash, /*executor*/ @@ -468,7 +468,7 @@ public class dir { String urlhash = plasmaURL.urlHash(new URL(urlstring)); Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes()); switchboard.removeReferences(urlhash, words); - switchboard.loadedURL.remove(urlhash); + switchboard.urlPool.loadedURL.remove(urlhash); } catch (Exception e) { System.out.println("INTERNAL ERROR in dir.deletePhrase:"); e.printStackTrace(); diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index ed619d7db..3e889ba36 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -152,10 +152,10 @@ public class crawlOrder { reason = reasonString; delay = "" + (acceptDelay / 4); // send lurl-Entry as response - plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url)); + plasmaCrawlLURL.entry entry = 
switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url)); if (entry != null) { response = "double"; - switchboard.loadedURL.notifyGCrawl(entry.hash(), iam, youare); + switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare); lurl = crypt.simpleEncode(entry.toString()); delay = "1"; } else { diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index bbfe013ed..5928d6098 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -111,11 +111,11 @@ public class crawlReceipt { prop.put("delay", "3600"); } else if (result.equals("fill")) { // put new data into database - switchboard.loadedURL.newEntry(propStr, true, youare, iam, 1); - switchboard.noticeURL.remove(urlhash); + switchboard.urlPool.loadedURL.newEntry(propStr, true, youare, iam, 1); + switchboard.urlPool.noticeURL.remove(urlhash); // write log - plasmaCrawlLURL.entry entry = switchboard.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); if (entry == null) { switchboard.getLog().logError("RECEIVED wrong RECEIPT for hash " + urlhash + " from peer " + iam); } else { @@ -125,10 +125,10 @@ public class crawlReceipt { // ready for more prop.put("delay", "10"); } else { - plasmaCrawlNURL.entry en = switchboard.noticeURL.getEntry(urlhash); + plasmaCrawlNURL.entry en = switchboard.urlPool.noticeURL.getEntry(urlhash); if (en != null) { - switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false); - switchboard.noticeURL.remove(urlhash); + switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false); + switchboard.urlPool.noticeURL.remove(urlhash); } prop.put("delay", "100"); // what shall we do with that??? 
} diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 505d05651..8138fe0fc 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -110,7 +110,7 @@ public class transferRWI { switchboard.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry)); urlHash = entry.getUrlHash(); if ((!(unknownURL.contains(urlHash))) && - (!(switchboard.loadedURL.exists(urlHash)))) { + (!(switchboard.urlPool.loadedURL.exists(urlHash)))) { unknownURL.add(urlHash); } received++; diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 1dcafca8d..cec81faed 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -71,13 +71,13 @@ public class transferURL { if (granted) { int received = 0; - int sizeBefore = switchboard.loadedURL.size(); + int sizeBefore = switchboard.urlPool.loadedURL.size(); // read the urls from the other properties and store String urls; for (int i = 0; i < urlc; i++) { urls = (String) post.get("url" + i); if (urls != null) { - switchboard.loadedURL.newEntry(urls, true, iam, iam, 3); + switchboard.urlPool.loadedURL.newEntry(urls, true, iam, iam, 3); received++; } } @@ -85,7 +85,7 @@ public class transferURL { yacyCore.seedDB.mySeed.incRU(received); // return rewrite properties - int more = switchboard.loadedURL.size() - sizeBefore; + int more = switchboard.urlPool.loadedURL.size() - sizeBefore; doublevalues = "" + (received - more); switchboard.getLog().logInfo("Received " + received + " URL's from peer " + iam); if ((received - more) > 0) switchboard.getLog().logError("Received " + doublevalues + " double URL's from peer " + iam); diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index d1c2577d5..144c71938 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -61,16 +61,22 @@ import de.anomic.tools.bitfield; 
public class plasmaCrawlNURL extends plasmaURL { - public static final int STACK_TYPE_NULL = 0; // do not stack - public static final int STACK_TYPE_CORE = 1; // put on local stack - public static final int STACK_TYPE_LIMIT = 2; // put on global stack - public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled - public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack + public static final int STACK_TYPE_NULL = 0; // do not stack + public static final int STACK_TYPE_CORE = 1; // put on local stack + public static final int STACK_TYPE_LIMIT = 2; // put on global stack + public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled + public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack + public static final int STACK_TYPE_IMAGE = 11; // put on image stack + public static final int STACK_TYPE_MOVIE = 12; // put on movie stack + public static final int STACK_TYPE_MUSIC = 13; // put on music stack private kelondroStack coreStack; // links found by crawling to depth-1 private kelondroStack limitStack; // links found by crawling at target depth private kelondroStack overhangStack; // links found by crawling at depth+1 private kelondroStack remoteStack; // links from remote crawl orders + private kelondroStack imageStack; // links pointing to image resources + private kelondroStack movieStack; // links pointing to movie resources + private kelondroStack musicStack; // links pointing to music resources private HashSet stackIndex; // to find out if a specific link is already on any stack @@ -104,39 +110,41 @@ public class plasmaCrawlNURL extends plasmaURL { urlHashCache = new kelondroTree(cacheFile, bufferkb * 0x400, ce); } - File localCrawlStack = new File(cacheStacksPath, "urlNoticeLocal0.stack"); - if (localCrawlStack.exists()) { - coreStack = new kelondroStack(localCrawlStack, 0); - } else { - coreStack = new 
kelondroStack(localCrawlStack, 0, new int[] {plasmaURL.urlHashLength}); - } - File limitCrawlStack = new File(cacheStacksPath, "urlNoticeLimit0.stack"); - if (limitCrawlStack.exists()) { - limitStack = new kelondroStack(limitCrawlStack, 0); - } else { - limitStack = new kelondroStack(limitCrawlStack, 0, new int[] {plasmaURL.urlHashLength}); - } - File overhangCrawlStack = new File(cacheStacksPath, "urlNoticeOverhang0.stack"); - if (overhangCrawlStack.exists()) { - overhangStack = new kelondroStack(overhangCrawlStack, 0); - } else { - overhangStack = new kelondroStack(overhangCrawlStack, 0, new int[] {plasmaURL.urlHashLength}); - } - File globalCrawlStack = new File(cacheStacksPath, "urlNoticeRemote0.stack"); - if (globalCrawlStack.exists()) { - remoteStack = new kelondroStack(globalCrawlStack, 0); - } else { - remoteStack = new kelondroStack(globalCrawlStack, 0, new int[] {plasmaURL.urlHashLength}); - } - + File coreStackFile = new File(cacheStacksPath, "urlNoticeLocal0.stack"); + File limitStackFile = new File(cacheStacksPath, "urlNoticeLimit0.stack"); + File overhangStackFile = new File(cacheStacksPath, "urlNoticeOverhang0.stack"); + File remoteStackFile = new File(cacheStacksPath, "urlNoticeRemote0.stack"); + File imageStackFile = new File(cacheStacksPath, "urlNoticeImage0.stack"); + File movieStackFile = new File(cacheStacksPath, "urlNoticeMovie0.stack"); + File musicStackFile = new File(cacheStacksPath, "urlNoticeMusic0.stack"); + if (coreStackFile.exists()) coreStack = new kelondroStack(coreStackFile, 0); else coreStack = new kelondroStack(coreStackFile, 0, new int[] {plasmaURL.urlHashLength}); + if (limitStackFile.exists()) limitStack = new kelondroStack(limitStackFile, 0); else limitStack = new kelondroStack(limitStackFile, 0, new int[] {plasmaURL.urlHashLength}); + if (overhangStackFile.exists()) overhangStack = new kelondroStack(overhangStackFile, 0); else overhangStack = new kelondroStack(overhangStackFile, 0, new int[] {plasmaURL.urlHashLength}); + if 
(remoteStackFile.exists()) remoteStack = new kelondroStack(remoteStackFile, 0); else remoteStack = new kelondroStack(remoteStackFile, 0, new int[] {plasmaURL.urlHashLength}); + if (imageStackFile.exists()) imageStack = new kelondroStack(imageStackFile, 0); else imageStack = new kelondroStack(imageStackFile, 0, new int[] {plasmaURL.urlHashLength}); + if (movieStackFile.exists()) movieStack = new kelondroStack(movieStackFile, 0); else movieStack = new kelondroStack(movieStackFile, 0, new int[] {plasmaURL.urlHashLength}); + if (musicStackFile.exists()) musicStack = new kelondroStack(musicStackFile, 0); else musicStack = new kelondroStack(musicStackFile, 0, new int[] {plasmaURL.urlHashLength}); + // init stack Index stackIndex = new HashSet(); - Iterator i = coreStack.iterator(); - while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); - i = remoteStack.iterator(); - while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + new initStackIndex().start(); } + public class initStackIndex extends Thread { + public void run() { + Iterator i; + try { + i = coreStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + i = limitStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + i = overhangStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + i = remoteStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + i = imageStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + i = movieStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + i = musicStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + } catch 
(IOException e) {} + } + } + private static String normalizeHost(String host) { if (host.length() > urlHostLength) host = host.substring(0, urlHostLength); host = host.toLowerCase(); @@ -155,20 +163,17 @@ public class plasmaCrawlNURL extends plasmaURL { return coreStack.size() + limitStack.size() + remoteStack.size(); } - public int coreStackSize() { - return coreStack.size(); - } - - public int limitStackSize() { - return limitStack.size(); - } - - public int overhangStackSize() { - return overhangStack.size(); - } - - public int remoteStackSize() { - return remoteStack.size(); + public int stackSize(int stackType) { + switch (stackType) { + case STACK_TYPE_CORE: return coreStack.size(); + case STACK_TYPE_LIMIT: return limitStack.size(); + case STACK_TYPE_OVERHANG: return overhangStack.size(); + case STACK_TYPE_REMOTE: return remoteStack.size(); + case STACK_TYPE_IMAGE: return imageStack.size(); + case STACK_TYPE_MOVIE: return movieStack.size(); + case STACK_TYPE_MUSIC: return musicStack.size(); + default: return -1; + } } public boolean existsInStack(String urlhash) { @@ -179,35 +184,48 @@ public class plasmaCrawlNURL extends plasmaURL { String profile, int depth, int anchors, int forkfactor, int stackMode) { entry e = new entry(initiator, url, referrer, name, loaddate, profile, depth, anchors, forkfactor); - - // stackMode can have 3 cases: - // 0 = do not stack - // 1 = on local stack - // 2 = on global stack - // 3 = on overhang stack - // 4 = on remote stack try { - if (stackMode == 1) coreStack.push(new byte[][] {e.hash.getBytes()}); - if (stackMode == 2) limitStack.push(new byte[][] {e.hash.getBytes()}); - if (stackMode == 3) overhangStack.push(new byte[][] {e.hash.getBytes()}); - if (stackMode == 4) remoteStack.push(new byte[][] {e.hash.getBytes()}); + switch (stackMode) { + case STACK_TYPE_CORE: coreStack.push(new byte[][] {e.hash.getBytes()}); break; + case STACK_TYPE_LIMIT: limitStack.push(new byte[][] {e.hash.getBytes()}); break; + case 
STACK_TYPE_OVERHANG: overhangStack.push(new byte[][] {e.hash.getBytes()}); break; + case STACK_TYPE_REMOTE: remoteStack.push(new byte[][] {e.hash.getBytes()}); break; + case STACK_TYPE_IMAGE: imageStack.push(new byte[][] {e.hash.getBytes()}); break; + case STACK_TYPE_MOVIE: movieStack.push(new byte[][] {e.hash.getBytes()}); break; + case STACK_TYPE_MUSIC: musicStack.push(new byte[][] {e.hash.getBytes()}); break; + default: break; + } stackIndex.add(new String(e.hash.getBytes())); } catch (IOException er) { } return e; } - public entry corePop() { return pop(coreStack); } - public entry[] coreTop(int count) { return top(coreStack, count); } - - public entry limitPop() { return pop(limitStack); } - public entry[] limitTop(int count) { return top(limitStack, count); } - - public entry overhangPop() { return pop(overhangStack); } - public entry[] overhangTop(int count) { return top(overhangStack, count); } + public entry[] top(int stackType, int count) { + switch (stackType) { + case STACK_TYPE_CORE: return top(coreStack, count); + case STACK_TYPE_LIMIT: return top(limitStack, count); + case STACK_TYPE_OVERHANG: return top(overhangStack, count); + case STACK_TYPE_REMOTE: return top(remoteStack, count); + case STACK_TYPE_IMAGE: return top(imageStack, count); + case STACK_TYPE_MOVIE: return top(movieStack, count); + case STACK_TYPE_MUSIC: return top(musicStack, count); + default: return null; + } + } - public entry remotePop() { return pop(remoteStack); } - public entry[] remoteTop(int count) { return top(remoteStack, count); } + public entry pop(int stackType) { + switch (stackType) { + case STACK_TYPE_CORE: return pop(coreStack); + case STACK_TYPE_LIMIT: return pop(limitStack); + case STACK_TYPE_OVERHANG: return pop(overhangStack); + case STACK_TYPE_REMOTE: return pop(remoteStack); + case STACK_TYPE_IMAGE: return pop(imageStack); + case STACK_TYPE_MOVIE: return pop(movieStack); + case STACK_TYPE_MUSIC: return pop(musicStack); + default: return null; + } + } private 
entry pop(kelondroStack stack) { // this is a filo - pop @@ -237,7 +255,7 @@ public class plasmaCrawlNURL extends plasmaURL { return null; } } - + public synchronized entry getEntry(String hash) { return new entry(hash); } @@ -247,10 +265,11 @@ public class plasmaCrawlNURL extends plasmaURL { urlHashCache.remove(hash.getBytes()); } catch (IOException e) {} } - + public class entry { - private String initiator; // the initiator hash, is NULL or "" if it is the own proxy + private String initiator; // the initiator hash, is NULL or "" if it is the own proxy; + // if this is generated by a crawl, the own peer hash is entered private String hash; // the url's hash private String referrer; // the url's referrer hash private URL url; // the url as string diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 1d78f87f4..bb035fe18 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -158,9 +158,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser private File cachePath; private File plasmaPath; public File listsPath; - public plasmaCrawlLURL loadedURL; - public plasmaCrawlNURL noticeURL; - public plasmaCrawlEURL errorURL; + public plasmaURLPool urlPool; public plasmaWordIndex wordIndex; public plasmaSearch searchManager; public plasmaHTCache cacheManager; @@ -248,13 +246,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // start indexing management log.logSystem("Starting Indexing Management"); - loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL); - noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL); - errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL); + urlPool = new plasmaURLPool(plasmaPath, ramLURL, ramNURL, ramEURL); + + wordIndex = new plasmaWordIndex(plasmaPath, ramRWI, log); int wordCacheMax = Integer.parseInt((String) 
getConfig("wordCacheMax", "10000")); wordIndex.setMaxWords(wordCacheMax); - searchManager = new plasmaSearch(loadedURL, wordIndex); + searchManager = new plasmaSearch(urlPool.loadedURL, wordIndex); // start a cache manager log.logSystem("Starting HT Cache Manager"); @@ -402,7 +400,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } catch (IOException e) {} } private void cleanProfiles() { - if ((queueStack.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return; + if ((queueStack.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return; Iterator i = profiles.profiles(true); plasmaCrawlProfile.entry entry; try { @@ -446,9 +444,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser wikiDB.close(); messageDB.close(); facilityDB.close(); - loadedURL.close(); - noticeURL.close(); - errorURL.close(); + urlPool.close(); profiles.close(); parser.close(); cacheManager.close(); @@ -468,7 +464,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public int lUrlSize() { - return loadedURL.size(); + return urlPool.loadedURL.size(); } public int cacheSizeMin() { @@ -496,10 +492,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // do one processing step log.logDebug("DEQUEUE: cacheManager=" + ((cacheManager.idle()) ? 
"idle" : "busy") + ", queueStack=" + queueStack.size() + - ", coreStackSize=" + noticeURL.coreStackSize() + - ", limitStackSize=" + noticeURL.limitStackSize() + - ", overhangStackSize=" + noticeURL.overhangStackSize() + - ", remoteStackSize=" + noticeURL.remoteStackSize()); + ", coreStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + + ", limitStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + + ", overhangStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + + ", remoteStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE)); nextentry = (plasmaHTCache.Entry) queueStack.removeFirst(); } processResourceStack(nextentry); @@ -508,9 +504,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public int cleanupJobSize() { int c = 0; - if ((errorURL.stackSize() > 1000)) c++; + if ((urlPool.errorURL.stackSize() > 1000)) c++; for (int i = 1; i <= 6; i++) { - if (loadedURL.getStackSize(i) > 1000) c++; + if (urlPool.loadedURL.getStackSize(i) > 1000) c++; } return c; } @@ -520,14 +516,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser boolean hasDoneSomething = false; // clean up error stack - if ((errorURL.stackSize() > 1000)) { - errorURL.clearStack(); + if ((urlPool.errorURL.stackSize() > 1000)) { + urlPool.errorURL.clearStack(); hasDoneSomething = true; } // clean up loadedURL stack for (int i = 1; i <= 6; i++) { - if (loadedURL.getStackSize(i) > 1000) { - loadedURL.clearStack(i); + if (urlPool.loadedURL.getStackSize(i) > 1000) { + urlPool.loadedURL.clearStack(i); hasDoneSomething = true; } } @@ -567,11 +563,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public int coreCrawlJobSize() { - return noticeURL.coreStackSize(); + return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); } public boolean coreCrawlJob() { - if (noticeURL.coreStackSize() == 0) { + 
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) { //log.logDebug("CoreCrawl: queue is empty"); return false; } @@ -600,13 +596,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // do a local crawl - plasmaCrawlNURL.entry urlEntry = noticeURL.corePop(); + plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE); if (urlEntry.url() == null) return false; String profileHandle = urlEntry.profileHandle(); //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); if (profile == null) { - log.logError("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + log.logError("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); return false; } log.logDebug("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + @@ -617,11 +613,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public int limitCrawlTriggerJobSize() { - return noticeURL.limitStackSize(); + return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); } public boolean limitCrawlTriggerJob() { - if (noticeURL.limitStackSize() == 0) { + if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) { //log.logDebug("LimitCrawl: queue is empty"); return false; } @@ -639,13 +635,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // start a global crawl, if possible - plasmaCrawlNURL.entry urlEntry = noticeURL.limitPop(); + 
plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); if (urlEntry.url() == null) return true; String profileHandle = urlEntry.profileHandle(); //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); if (profile == null) { - log.logError("REMOTECRAWLTRIGGER[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + log.logError("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); return false; } log.logDebug("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + @@ -653,7 +649,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? 
"true" : "false"))); boolean tryRemote = - ((noticeURL.coreStackSize() != 0) || (queueStack.size() != 0)) /* should do ourself */ && + ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (queueStack.size() != 0)) /* should do ourself */ && (profile.remoteIndexing()) /* granted */ && (urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) /* not proxy */ && ((yacyCore.seedDB.mySeed.isSenior()) || @@ -681,7 +677,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public int remoteTriggeredCrawlJobSize() { - return noticeURL.remoteStackSize(); + return urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE); } public boolean remoteTriggeredCrawlJob() { @@ -689,7 +685,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // do nothing if either there are private processes to be done // or there is no global crawl on the stack - if (noticeURL.remoteStackSize() == 0) { + if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) { //log.logDebug("GlobalCrawl: queue is empty"); return false; } @@ -720,13 +716,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // we don't want to crawl a global URL globally, since WE are the global part. 
(from this point of view) - plasmaCrawlNURL.entry urlEntry = noticeURL.remotePop(); + plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); if (urlEntry.url() == null) return false; String profileHandle = urlEntry.profileHandle(); //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); if (profile == null) { - log.logError("REMOTETRIGGEREDCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + log.logError("REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); return false; } log.logDebug("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + @@ -807,12 +803,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (rejectReason == null) { c++; } else { - errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash, + urlPool.errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash, (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); } } log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() + - ", NEW CRAWL STACK SIZE IS " + noticeURL.coreStackSize()); + ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); } // create index @@ -835,7 +831,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //log.logInfo("INDEXING 
HEADLINE:" + descr); try { log.logDebug("(Profile) Create LURL-Entry for '" + entry.nomalizedURLString + "'"); - plasmaCrawlLURL.entry newEntry = loadedURL.newEntry( + plasmaCrawlLURL.entry newEntry = urlPool.loadedURL.newEntry( entry.url, descr, entry.lastModified, new Date(), initiatorHash, yacyCore.seedDB.mySeed.hash, @@ -850,7 +846,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String urlHash = newEntry.hash(); log.logDebug("(Profile) Remove NURL for '" + entry.nomalizedURLString + "'"); - noticeURL.remove(urlHash); // worked-off + urlPool.noticeURL.remove(urlHash); // worked-off if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && (entry.profile.localIndexing())) { @@ -882,7 +878,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } else { log.logInfo("Not indexed any word in URL " + entry.url + "; cause: " + noIndexReason); - errorURL.newEntry(entry.url, referrerHash, + urlPool.errorURL.newEntry(entry.url, referrerHash, ((entry.proxy()) ? 
plasmaURL.dummyHash : entry.initiator()), yacyCore.seedDB.mySeed.hash, descr, noIndexReason, new bitfield(plasmaURL.urlFlagLength), true); @@ -925,7 +921,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // filter deny if ((currentdepth > 0) && (!(nexturlString.matches(profile.generalFilter())))) { reason = "denied_(does_not_match_filter)"; - errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, + urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false); return reason; } @@ -933,7 +929,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // deny cgi if (plasmaHTCache.isCGI(nexturlString)) { reason = "denied_(cgi_url)"; - errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, + urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false); return reason; } @@ -941,22 +937,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // deny post properties if ((plasmaHTCache.isPOST(nexturlString)) && (!(profile.crawlingQ()))) { reason = "denied_(post_url)"; - errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, + urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false); return reason; } String nexturlhash = plasmaURL.urlHash(nexturl); - if (loadedURL.exists(nexturlhash)) { + String dbocc = ""; + if ((dbocc = urlPool.testHash(nexturlhash)) != null) { // DISTIGUISH OLD/RE-SEARCH CASES HERE! 
- reason = "double_(already_loaded)"; - errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, - name, reason, new bitfield(plasmaURL.urlFlagLength), false); - return reason; - } - if (noticeURL.existsInStack(nexturlhash)) { - reason = "double_(noticed_in_crawler)"; - errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, + reason = "double_(registered_in_" + dbocc + ")"; + urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, name, reason, new bitfield(plasmaURL.urlFlagLength), false); return reason; } @@ -970,7 +961,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */; - noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ + urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ nexturl, /* url clear text string */ loadDate, /* load date */ referrerHash, /* last url in crawling queue */ @@ -988,11 +979,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser private URL hash2url(String urlhash) { if (urlhash.equals(plasmaURL.dummyHash)) return null; - plasmaCrawlNURL.entry ne = noticeURL.getEntry(urlhash); + plasmaCrawlNURL.entry ne = urlPool.noticeURL.getEntry(urlhash); if (ne != null) return ne.url(); - plasmaCrawlLURL.entry le = loadedURL.getEntry(urlhash); + plasmaCrawlLURL.entry le = urlPool.loadedURL.getEntry(urlhash); if (le != null) return le.url(); - plasmaCrawlEURL.entry ee = errorURL.getEntry(urlhash); + plasmaCrawlEURL.entry ee = urlPool.errorURL.getEntry(urlhash); if (ee != null) return ee.url(); return null; } @@ -1005,17 +996,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser private boolean processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile) { // work off one Crawl stack 
entry if ((urlEntry == null) && (urlEntry.url() == null)) { - log.logInfo("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.limitStackSize() + ", " + noticeURL.overhangStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null"); + log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); return false; } cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile); - log.logInfo("LOCALCRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.limitStackSize() + ", " + noticeURL.overhangStackSize() + ", " + noticeURL.remoteStackSize() + "]: enqueued for load " + urlEntry.url()); + log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: enqueued for load " + urlEntry.url()); return true; } private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.entry urlEntry) { if (urlEntry == null) { - log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: urlEntry=null"); + log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); return false; } @@ -1077,8 +1068,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String lurl = (String) page.get("lurl"); if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); - 
plasmaCrawlLURL.entry entry = loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); - noticeURL.remove(entry.hash()); + plasmaCrawlLURL.entry entry = urlPool.loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); + urlPool.noticeURL.remove(entry.hash()); log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + "). URL IS CONSIDERED AS 'LOADED!'"); return true; } else { @@ -1173,7 +1164,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser int fetchpeers = ((int) time / 1000) * 3; // number of target peers; means 30 peers in 10 seconds long fetchtime = time * 7 / 10; // time to waste if (fetchcount > count) fetchcount = count; - globalresults = yacySearch.searchHashes(queryhashes, loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime); + globalresults = yacySearch.searchHashes(queryhashes, urlPool.loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime); log.logDebug("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); } prop.put("globalresults", globalresults); // the result are written to the local DB @@ -1217,7 +1208,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if ((seed == null) || ((address = seed.getAddress()) == null)) { // seed is not known from here removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes())); - loadedURL.remove(urlentry.hash()); // clean up + urlPool.loadedURL.remove(urlentry.hash()); // clean up continue; // next result } url = new URL("http://" + address + "/" + host.substring(0, p) + filename); @@ -1367,7 +1358,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if 
(actionName.equals("urlcount")) { serverObjects result = new serverObjects(); - result.put("urls","" + loadedURL.size()); + result.put("urls","" + urlPool.loadedURL.size()); return result; } @@ -1392,7 +1383,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // finally, delete the url entry // determine the url string - plasmaCrawlLURL.entry entry = loadedURL.getEntry(urlhash); + plasmaCrawlLURL.entry entry = urlPool.loadedURL.getEntry(urlhash); URL url = entry.url(); if (url == null) return 0; // get set of words @@ -1401,7 +1392,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // delete all word references int count = removeReferences(urlhash, words); // finally delete the url entry itself - loadedURL.remove(urlhash); + urlPool.loadedURL.remove(urlhash); return count; } @@ -1443,7 +1434,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if ((yacyCore.seedDB == null) || (yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.isVirgin()) || - (loadedURL.size() < 10) || + (urlPool.loadedURL.size() < 10) || (wordIndex.size() < 100) || (!(yacyCore.seedDB.mySeed.isJunior()))) return false; @@ -1453,7 +1444,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if ( (queueStack.size() == 0) && (cacheLoader.size() == 0) && - (noticeURL.stackSize() == 0) && + (urlPool.noticeURL.stackSize() == 0) && (getConfig("allowDistributeIndex", "false").equals("true")) && ((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) { indexCount = transferred; @@ -1508,7 +1499,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser while ((e.hasMoreElements()) && (hc < peerCount)) { seed = (yacySeed) e.nextElement(); if (seed != null) { - error = yacyClient.transferIndex(seed, indexEntities, loadedURL); + error = yacyClient.transferIndex(seed, indexEntities, urlPool.loadedURL); if (error == null) { 
log.logInfo("Index Transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull"); peerNames += ", " + seed.getName(); diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java new file mode 100644 index 000000000..93e3318e0 --- /dev/null +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -0,0 +1,77 @@ +// plasmaURLPool.java +// ----------------------- +// part of YaCy +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// last major change: 16.06.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. 
The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +// this class combines all url storage methods into one. It is the host for all url storage + + +package de.anomic.plasma; + +import java.io.File; +import java.io.IOException; + +public class plasmaURLPool { + + + public plasmaCrawlLURL loadedURL; + public plasmaCrawlNURL noticeURL; + public plasmaCrawlEURL errorURL; + + public plasmaURLPool(File plasmaPath, int ramLURL, int ramNURL, int ramEURL) throws IOException { + loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL); + noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL); + errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL); + } + + public String testHash(String hash) { + // tests if hash occurs in the loadedURL database or in the noticeURL (crawler) stack; errorURL is not checked + // if it exists, the name of the database ("loaded" or "crawler") is returned, + // if it does not exist, null is returned + if (loadedURL.exists(hash)) return "loaded"; + if (noticeURL.existsInStack(hash)) return "crawler"; + return null; + } + + public void close() throws IOException { + loadedURL.close(); + noticeURL.close(); + errorURL.close(); + } +}