From 19dbed7cc85bfbbeca4b288bc07f87f86cc5c054 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 12 Jul 2005 15:09:35 +0000 Subject: [PATCH] code clean-up git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@401 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 1 + htroot/IndexCreateIndexingQueue_p.java | 4 +- htroot/IndexCreateWWWGlobalQueue_p.java | 4 +- htroot/IndexCreateWWWLocalQueue_p.java | 4 +- htroot/yacy/crawlReceipt.java | 2 +- .../de/anomic/kelondro/kelondroMSetTools.java | 47 ++ source/de/anomic/plasma/plasmaCrawlEURL.java | 20 +- source/de/anomic/plasma/plasmaCrawlNURL.java | 28 +- .../de/anomic/plasma/plasmaSnippetCache.java | 37 ++ .../de/anomic/plasma/plasmaSwitchboard.java | 413 ++---------------- source/de/anomic/plasma/plasmaURLPattern.java | 4 +- source/de/anomic/plasma/plasmaURLPool.java | 17 +- source/de/anomic/plasma/plasmaWordIndex.java | 4 + .../plasma/plasmaWordIndexDistribution.java | 254 +++++++++++ source/de/anomic/server/serverFileUtils.java | 22 +- source/de/anomic/yacy/yacyClient.java | 6 +- 16 files changed, 439 insertions(+), 428 deletions(-) create mode 100644 source/de/anomic/plasma/plasmaWordIndexDistribution.java diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 567c6a627..9ed16be34 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -108,6 +108,7 @@ public class IndexControl_p { if (post.containsKey("setIndexDistribute")) { boolean allowDistributeIndex = ((String) post.get("indexDistribute", "")).equals("on"); switchboard.setConfig("allowDistributeIndex", (allowDistributeIndex) ? "true" : "false"); + if (allowDistributeIndex) switchboard.indexDistribution.enable(); else switchboard.indexDistribution.disable(); } if (post.containsKey("setIndexReceive")) { diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index accb2fda9..2c8dfc8e4 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -123,11 +123,11 @@ public class IndexCreateIndexingQueue_p { } dark = true; String url, initiatorHash, executorHash; - plasmaCrawlEURL.entry entry; + plasmaCrawlEURL.Entry entry; yacySeed initiatorSeed, executorSeed; int j=0; for (i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) { - entry = (plasmaCrawlEURL.entry) switchboard.urlPool.errorURL.getStack(i); + entry = (plasmaCrawlEURL.Entry) switchboard.urlPool.errorURL.getStack(i); initiatorHash = entry.initiator(); executorHash = entry.executor(); url = entry.url().toString(); diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index d7dbf45f0..e174e3dd2 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -85,10 +85,10 @@ public class IndexCreateWWWGlobalQueue_p { prop.put("crawler-queue", 0); } else { prop.put("crawler-queue", 1); - plasmaCrawlNURL.entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, 100); + plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, 100); prop.put("crawler-queue_num", stackSize);//num Entries prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent - plasmaCrawlNURL.entry urle; + plasmaCrawlNURL.Entry urle; boolean dark = true; yacySeed initiator; int i; diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 9fb72806e..7605bb8b3 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -85,10 +85,10 @@ public class IndexCreateWWWLocalQueue_p { prop.put("crawler-queue", 0); } else { prop.put("crawler-queue", 1); - plasmaCrawlNURL.entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 100); + plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 100); prop.put("crawler-queue_num", stackSize);//num Entries prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent - plasmaCrawlNURL.entry urle; + plasmaCrawlNURL.Entry urle; boolean dark = true; yacySeed initiator; int i; diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 142c04f24..7b381f6af 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -125,7 +125,7 @@ public class crawlReceipt { // ready for more prop.put("delay", "10"); } else { - plasmaCrawlNURL.entry en = switchboard.urlPool.noticeURL.getEntry(urlhash); + plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(urlhash); if (en != null) { switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false); switchboard.urlPool.noticeURL.remove(urlhash); diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java index ff1bd17b6..21c0b7594 100644 --- a/source/de/anomic/kelondro/kelondroMSetTools.java +++ b/source/de/anomic/kelondro/kelondroMSetTools.java @@ -40,6 +40,11 @@ package de.anomic.kelondro; +import java.io.File; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.IOException; +import java.io.FileInputStream; import java.util.Comparator; import java.util.Iterator; import java.util.TreeMap; @@ -351,6 +356,48 @@ public class kelondroMSetTools { // ------------------------------------------------------------------------------------------------ + public static TreeMap loadMap(String mapname, String filename, String sep) { + TreeMap map = new TreeMap(); + BufferedReader br = null; + try { + br = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); + String line; + int pos; + while ((line = br.readLine()) != null) { + line = line.trim(); + if ((line.length() > 0) && (!(line.startsWith("#"))) && ((pos = line.indexOf(sep)) > 0)) + map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim()); + } + } catch (IOException e) { + } finally { + if (br != null) try { br.close(); } catch (Exception e) {} + } + return map; + } + + public static TreeSet loadList(File file) { + TreeSet list = new TreeSet(kelondroMSetTools.fastStringComparator); + if (!(file.exists())) return list; + + BufferedReader br = null; + try { + br = new BufferedReader(new InputStreamReader(new FileInputStream(file))); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if ((line.length() > 0) && (!(line.startsWith("#")))) list.add(line.trim().toLowerCase()); + } + br.close(); + } catch (IOException e) { + } finally { + if (br != null) try{br.close();}catch(Exception e){} + } + return list; + } + + // ------------------------------------------------------------------------------------------------ + + public static void main(String[] args) { TreeMap m = new TreeMap(); TreeSet s = new TreeSet(); diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index ff41132bd..0ebed064b 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -84,7 +84,7 @@ public class plasmaCrawlEURL extends plasmaURL { } } - public synchronized entry newEntry(URL url, String referrer, String initiator, String executor, + public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor, String name, String failreason, bitfield flags, boolean retry) { if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash; if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash; @@ -101,15 +101,15 @@ public class plasmaCrawlEURL extends plasmaURL { map.put("failreason", failreason); map.put("flags", flags); rejectedStack.add(map); - entry e = new entry(url, referrer, initiator, executor, name, failreason, flags); + Entry e = new Entry(url, referrer, initiator, executor, name, failreason, flags); // put in table if (retry) e.store(); return e; } - public synchronized entry getEntry(String hash) { - return new entry(hash); + public synchronized Entry getEntry(String hash) { + return new Entry(hash); } public void clearStack() { @@ -120,13 +120,13 @@ public class plasmaCrawlEURL extends plasmaURL { return rejectedStack.size(); } - public entry getStack(int pos) { + public Entry getStack(int pos) { HashMap m = (HashMap) rejectedStack.get(pos); - return new entry((URL) m.get("url"), (String) m.get("referrer"), (String) m.get("initiator"), (String) m.get("executor"), + return new Entry((URL) m.get("url"), (String) m.get("referrer"), (String) m.get("initiator"), (String) m.get("executor"), (String) m.get("name"), (String) m.get("failreason"), (bitfield) m.get("flags")); } - public class entry { + public class Entry { private String hash; // the url's hash private String referrer; // the url's referrer hash @@ -140,7 +140,7 @@ public class plasmaCrawlEURL extends plasmaURL { private String failreason; // string describing reason for load fail private bitfield flags; // extra space - public entry(URL url, String referrer, String initiator, String executor, String name, String failreason, bitfield flags) { + public Entry(URL url, String referrer, String initiator, String executor, String name, String failreason, bitfield flags) { // create new entry and store it into database this.hash = urlHash(url); this.referrer = (referrer == null) ? dummyHash : referrer; @@ -156,7 +156,7 @@ public class plasmaCrawlEURL extends plasmaURL { } - public entry(String hash) { + public Entry(String hash) { // generates an plasmaEURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. @@ -265,7 +265,7 @@ public class plasmaCrawlEURL extends plasmaURL { return i.hasNext(); } public Object nextElement() { - return new entry(new String(((byte[][]) i.next())[0])); + return new Entry(new String(((byte[][]) i.next())[0])); } } diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 740e2b6c0..3ee22fbc2 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -187,9 +187,9 @@ public class plasmaCrawlNURL extends plasmaURL { return stackIndex.contains(urlhash); } - public synchronized entry newEntry(String initiator, URL url, Date loaddate, String referrer, String name, + public synchronized Entry newEntry(String initiator, URL url, Date loaddate, String referrer, String name, String profile, int depth, int anchors, int forkfactor, int stackMode) { - entry e = new entry(initiator, url, referrer, name, loaddate, profile, + Entry e = new Entry(initiator, url, referrer, name, loaddate, profile, depth, anchors, forkfactor); try { switch (stackMode) { @@ -208,7 +208,7 @@ public class plasmaCrawlNURL extends plasmaURL { return e; } - public entry[] top(int stackType, int count) { + public Entry[] top(int stackType, int count) { switch (stackType) { case STACK_TYPE_CORE: return top(coreStack, count); case STACK_TYPE_LIMIT: return top(limitStack, count); @@ -221,7 +221,7 @@ public class plasmaCrawlNURL extends plasmaURL { } } - public entry pop(int stackType) { + public Entry pop(int stackType) { switch (stackType) { case STACK_TYPE_CORE: return pop(coreStack); case STACK_TYPE_LIMIT: return pop(limitStack); @@ -234,11 +234,11 @@ public class plasmaCrawlNURL extends plasmaURL { } } - private entry pop(kelondroStack stack) { + private Entry pop(kelondroStack stack) { // this is a filo - pop try { if (stack.size() > 0) { - entry e = new entry(new String(stack.pop()[0])); + Entry e = new Entry(new String(stack.pop()[0])); stackIndex.remove(e.hash); return e; } else { @@ -249,13 +249,13 @@ public class plasmaCrawlNURL extends plasmaURL { } } - private entry[] top(kelondroStack stack, int count) { + private Entry[] top(kelondroStack stack, int count) { // this is a filo - top if (count > stack.size()) count = stack.size(); - entry[] list = new entry[count]; + Entry[] list = new Entry[count]; try { for (int i = 0; i < count; i++) { - list[i] = new entry(new String(stack.top(i)[0])); + list[i] = new Entry(new String(stack.top(i)[0])); } return list; } catch (IOException e) { @@ -263,8 +263,8 @@ public class plasmaCrawlNURL extends plasmaURL { } } - public synchronized entry getEntry(String hash) { - return new entry(hash); + public synchronized Entry getEntry(String hash) { + return new Entry(hash); } public synchronized void remove(String hash) { @@ -273,7 +273,7 @@ public class plasmaCrawlNURL extends plasmaURL { } catch (IOException e) {} } - public class entry { + public class Entry { private String initiator; // the initiator hash, is NULL or "" if it is the own proxy; // if this is generated by a crawl, the own peer hash in entered @@ -289,7 +289,7 @@ public class plasmaCrawlNURL extends plasmaURL { private bitfield flags; private int handle; - public entry(String initiator, URL url, String referrer, String name, Date loaddate, String profileHandle, + public Entry(String initiator, URL url, String referrer, String name, Date loaddate, String profileHandle, int depth, int anchors, int forkfactor) { // create new entry and store it into database this.hash = urlHash(url); @@ -307,7 +307,7 @@ public class plasmaCrawlNURL extends plasmaURL { store(); } - public entry(String hash) { + public Entry(String hash) { // generates an plasmaNURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 9d35255e6..54d60a7f2 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -54,6 +54,7 @@ import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySearch; +import de.anomic.htmlFilter.htmlFilterContentScraper; public class plasmaSnippetCache { @@ -368,4 +369,40 @@ public class plasmaSnippetCache { log); } + public void fetch(plasmaSearch.result acc, Set queryhashes, String urlmask, int fetchcount) { + // fetch snippets + int i = 0; + plasmaCrawlLURL.Entry urlentry; + String urlstring; + plasmaSnippetCache.result snippet; + while ((acc.hasMoreElements()) && (i < fetchcount)) { + urlentry = acc.nextElement(); + if (urlentry.url().getHost().endsWith(".yacyh")) continue; + urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url()); + if ((urlstring.matches(urlmask)) && + (!(existsInCache(urlentry.url(), queryhashes)))) { + new Fetcher(urlentry.url(), queryhashes).start(); + i++; + } + } + } + + public class Fetcher extends Thread { + URL url; + Set queryhashes; + public Fetcher(URL url, Set queryhashes) { + if (url.getHost().endsWith(".yacyh")) return; + this.url = url; + this.queryhashes = queryhashes; + } + public void run() { + log.logDebug("snippetFetcher: try to get URL " + url); + plasmaSnippetCache.result snippet = retrieve(url, queryhashes, true, 260); + if (snippet.line == null) + log.logDebug("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error); + else + log.logDebug("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source); + } + } + } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 8989a0b35..d5c3a6413 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -157,28 +157,28 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public static plasmaURLPattern urlBlacklist; // storage management - private File cachePath; - private File plasmaPath; - public File listsPath; - public plasmaURLPool urlPool; - public plasmaWordIndex wordIndex; - public plasmaSearch searchManager; - public plasmaHTCache cacheManager; - public plasmaSnippetCache snippetCache; - public plasmaCrawlLoader cacheLoader; - public plasmaSwitchboardQueue sbQueue; - public messageBoard messageDB; - public wikiBoard wikiDB; - public String remoteProxyHost; - public int remoteProxyPort; - public boolean remoteProxyUse; - public plasmaCrawlProfile profiles; - public plasmaCrawlProfile.entry defaultProxyProfile; - public plasmaCrawlProfile.entry defaultRemoteProfile; - public distributeIndex indexDistribution; - public HashMap outgoingCookies, incomingCookies; - public kelondroTables facilityDB; - public plasmaParser parser; + private File cachePath; + private File plasmaPath; + public File listsPath; + public plasmaURLPool urlPool; + public plasmaWordIndex wordIndex; + public plasmaSearch searchManager; + public plasmaHTCache cacheManager; + public plasmaSnippetCache snippetCache; + public plasmaCrawlLoader cacheLoader; + public plasmaSwitchboardQueue sbQueue; + public messageBoard messageDB; + public wikiBoard wikiDB; + public String remoteProxyHost; + public int remoteProxyPort; + public boolean remoteProxyUse; + public plasmaCrawlProfile profiles; + public plasmaCrawlProfile.entry defaultProxyProfile; + public plasmaCrawlProfile.entry defaultRemoteProfile; + public plasmaWordIndexDistribution indexDistribution; + public HashMap outgoingCookies, incomingCookies; + public kelondroTables facilityDB; + public plasmaParser parser; public plasmaWordIndexClassicCacheMigration classicCache; private serverSemaphore shutdownSync = new serverSemaphore(0); @@ -217,7 +217,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (blueList == null) { // read only once upon first instantiation of this class String f = getConfig("plasmaBlueList", null); - if (f != null) blueList = loadList(new File(f)); else blueList= new TreeSet(); + if (f != null) blueList = kelondroMSetTools.loadList(new File(f)); else blueList= new TreeSet(); } // load the black-list / inspired by [AS] @@ -231,7 +231,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // load stopwords if (stopwords == null) { - stopwords = loadList(new File(rootPath, "yacy.stopwords")); + stopwords = kelondroMSetTools.loadList(new File(rootPath, "yacy.stopwords")); } // read memory amount @@ -376,7 +376,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser deployThread("30_peerping", "YaCy Core", "this is the p2p-control and peer-ping task", peerPing = new serverInstantThread(yc, "peerPing", null), 2000); peerPing.setSyncObject(new Object()); - indexDistribution = new distributeIndex(100 /*indexCount*/, 8000, 1 /*peerCount*/); + indexDistribution = new plasmaWordIndexDistribution(urlPool, wordIndex, log, + getConfig("allowDistributeIndex", "false").equals("true")); + indexDistribution.setCounts(100 /*indexCount*/, 1 /*peerCount*/, 8000); deployThread("20_dhtdistribution", "DHT Distribution (currently by juniors only)", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", new serverInstantThread(indexDistribution, "job", null), 120000); @@ -401,8 +403,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } - - private static String ppRamString(int bytes) { if (bytes < 1024) return bytes + " KByte"; bytes = bytes / 1024; @@ -457,8 +457,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public plasmaHTCache getCacheManager() { return cacheManager; } - - synchronized public void htEntryStoreEnqueued(plasmaHTCache.Entry entry) throws IOException { if (cacheManager.full()) @@ -497,33 +495,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser entry.name() )); - // write log - - /* - switch (entry.status) { - case plasmaHTCache.CACHE_UNFILLED: - log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break; - case plasmaHTCache.CACHE_FILL: - log.logInfo("CACHE FILL: " + entry.cacheFile + ((entry.cacheArray == null) ? "" : " (cacheArray is filled)")); - break; - case plasmaHTCache.CACHE_HIT: - log.logInfo("CACHE HIT: " + entry.cacheFile); break; - case plasmaHTCache.CACHE_STALE_NO_RELOAD: - log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break; - case plasmaHTCache.CACHE_STALE_RELOAD_GOOD: - log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break; - case plasmaHTCache.CACHE_STALE_RELOAD_BAD: - log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break; - case plasmaHTCache.CACHE_PASSING: - log.logInfo("PASSING: " + entry.cacheFile); break; - default: - log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break; - } - */ return true; } - public boolean htEntryStoreJob() { if (cacheManager.empty()) return false; try { @@ -536,26 +510,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public int htEntrySize() { return cacheManager.size(); } - - private static TreeSet loadList(File file) { - TreeSet list = new TreeSet(kelondroMSetTools.fastStringComparator); - if (!(file.exists())) return list; - - BufferedReader br = null; - try { - br = new BufferedReader(new InputStreamReader(new FileInputStream(file))); - String line; - while ((line = br.readLine()) != null) { - line = line.trim(); - if ((line.length() > 0) && (!(line.startsWith("#")))) list.add(line.trim().toLowerCase()); - } - br.close(); - } catch (IOException e) { - } finally { - if (br != null) try{br.close();}catch(Exception e){} - } - return list; - } public void close() { log.logSystem("SWITCHBOARD SHUTDOWN STEP 1: sending termination signal to managed threads:"); @@ -726,7 +680,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // do a local crawl - plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE); + plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE); String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; if (urlEntry.url() == null) { log.logError(stats + ": urlEntry.url() == null"); @@ -770,7 +724,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // start a global crawl, if possible - plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); + plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); String stats = "REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; if (urlEntry.url() == null) { log.logError(stats + ": urlEntry.url() == null"); @@ -855,7 +809,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view) - plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); + plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); String stats = "REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; if (urlEntry.url() == null) { log.logError(stats + ": urlEntry.url() == null"); @@ -878,7 +832,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } private void processResourceStack(plasmaSwitchboardQueue.Entry entry) { - // work off one stack entry with a fresh resource (scraped web page) + // work off one stack entry with a fresh resource try { // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) @@ -1104,7 +1058,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String nexturlhash = plasmaURL.urlHash(nexturl); String dbocc = ""; - if ((dbocc = urlPool.testHash(nexturlhash)) != null) { + if ((dbocc = urlPool.exists(nexturlhash)) != null) { // DISTIGUISH OLD/RE-SEARCH CASES HERE! reason = "double_(registered_in_" + dbocc + ")"; urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, @@ -1137,23 +1091,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return null; } - private URL hash2url(String urlhash) { - if (urlhash.equals(plasmaURL.dummyHash)) return null; - plasmaCrawlNURL.entry ne = urlPool.noticeURL.getEntry(urlhash); - if (ne != null) return ne.url(); - plasmaCrawlLURL.Entry le = urlPool.loadedURL.getEntry(urlhash); - if (le != null) return le.url(); - plasmaCrawlEURL.entry ee = urlPool.errorURL.getEntry(urlhash); - if (ee != null) return ee.url(); - return null; - } - - private String hash2urlstring(String urlhash) { - URL u = hash2url(urlhash); - if (u == null) return plasmaURL.dummyHash; else return u.toString(); - } - - private void processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile, String stats) { + private void processLocalCrawling(plasmaCrawlNURL.Entry urlEntry, plasmaCrawlProfile.entry profile, String stats) { // work off one Crawl stack entry if ((urlEntry == null) && (urlEntry.url() == null)) { log.logInfo(stats + ": urlEntry=null"); @@ -1164,7 +1102,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return; } - private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.entry urlEntry) { + private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.Entry urlEntry) { // return true iff another peer has/will index(ed) the url if (urlEntry == null) { log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); @@ -1183,7 +1121,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logDebug("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name()); return true; } - String nexturlString = urlEntry.url().toString(); String urlhash = plasmaURL.urlHash(urlEntry.url()); // check remote crawl @@ -1195,7 +1132,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // do the request - HashMap page = yacyClient.crawlOrder(remoteSeed, nexturlString, hash2urlstring(urlEntry.referrerHash()), 0); + HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()), 0); // check success @@ -1216,17 +1153,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser the resource is also returned in lurl */ if ((page == null) || (page.get("delay") == null)) { - log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + nexturlString + ")"); + log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + ")"); if (remoteSeed != null) yacyCore.peerActions.peerDeparture(remoteSeed); return false; } else try { - log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + nexturlString + ", response=" + page.toString()); // DEBUG + log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG int newdelay = Integer.parseInt((String) page.get("delay")); yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay); String response = (String) page.get("response"); if (response.equals("stacked")) { - log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " PLACED URL=" + nexturlString + "; NEW DELAY=" + newdelay); + log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " PLACED URL=" + urlEntry.url().toString() + "; NEW DELAY=" + newdelay); return true; } else if (response.equals("double")) { String lurl = (String) page.get("lurl"); @@ -1236,14 +1173,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser urlPool.loadedURL.newEntry(propStr, true), yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); urlPool.noticeURL.remove(entry.hash()); - log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + "). URL IS CONSIDERED AS 'LOADED!'"); + log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'"); return true; } else { - log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + ")"); + log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + ")"); return false; } } else { - log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + nexturlString); + log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + urlEntry.url().toString()); return false; } } catch (Exception e) { @@ -1253,7 +1190,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return false; } } - private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy"); public static String dateString(Date date) { @@ -1285,7 +1221,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logDebug("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch"); // take some elements and fetch the snippets - fetchSnippets(acc, queryhashes, urlmask, fetchcount); + snippetCache.fetch(acc, queryhashes, urlmask, fetchcount); } catch (IOException e) { e.printStackTrace(); } @@ -1293,42 +1229,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } - public void fetchSnippets(plasmaSearch.result acc, Set queryhashes, String urlmask, int fetchcount) { - // fetch the snippets - int i = 0; - plasmaCrawlLURL.Entry urlentry; - String urlstring; - plasmaSnippetCache.result snippet; - while ((acc.hasMoreElements()) && (i < fetchcount)) { - urlentry = acc.nextElement(); - if (urlentry.url().getHost().endsWith(".yacyh")) continue; - urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url()); - if ((urlstring.matches(urlmask)) && - (!(snippetCache.existsInCache(urlentry.url(), queryhashes)))) { - new snippetFetcher(urlentry.url(), queryhashes).start(); - i++; - } - } - } - - public class snippetFetcher extends Thread { - URL url; - Set queryhashes; - public snippetFetcher(URL url, Set queryhashes) { - if (url.getHost().endsWith(".yacyh")) return; - this.url = url; - this.queryhashes = queryhashes; - } - public void run() { - log.logDebug("snippetFetcher: try to get URL " + url); - plasmaSnippetCache.result snippet = snippetCache.retrieve(url, queryhashes, true, 260); - if (snippet.line == null) - log.logDebug("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error); - else - log.logDebug("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source); - } - } - public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) { serverObjects prop = new serverObjects(); @@ -1380,7 +1280,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (remainingTime < 500) remainingTime = 500; if (remainingTime > 3000) remainingTime = 3000; plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, remainingTime, 10); - if (!(global)) fetchSnippets(acc.cloneSmart(), queryhashes, urlmask, 10); + if (!(global)) snippetCache.fetch(acc.cloneSmart(), queryhashes, urlmask, 10); log.logDebug("SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); // result is a List of urlEntry elements: prepare answer @@ -1625,233 +1525,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } return count; } - - public class distributeIndex { - // distributes parts of the index to other peers - // stops as soon as an error occurrs - - int indexCount; - int peerCount; - long pause; - long maxTime; - - public distributeIndex(int indexCount, long maxTimePerTransfer, int peerCount) { - this.indexCount = indexCount; - this.peerCount = peerCount; - this.maxTime = maxTimePerTransfer; - } - - public boolean job() { - if ((yacyCore.seedDB == null) || - (yacyCore.seedDB.mySeed == null) || - (yacyCore.seedDB.mySeed.isVirgin()) || - (urlPool.loadedURL.size() < 10) || - (wordIndex.size() < 100) || - (!(yacyCore.seedDB.mySeed.isJunior()))) return false; - - int transferred; - long starttime = System.currentTimeMillis(); - try { - if ( - (sbQueue.size() == 0) && - (cacheLoader.size() == 0) && - (urlPool.noticeURL.stackSize() == 0) && - (getConfig("allowDistributeIndex", "false").equals("true")) && - ((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) { - indexCount = transferred; - if ((System.currentTimeMillis() - starttime) > (maxTime * peerCount)) indexCount--; else indexCount++; - if (indexCount < 30) indexCount = 30; - return true; - } else { - // make a long pause - return false; - } - } catch (IllegalArgumentException ee) { - // this is a bug that occurres if a not-fixeable data-inconsistency in the table structure was detected - // make a long pause - log.logError("very bad data inconsistency: " + ee.getMessage()); - //ee.printStackTrace(); - return false; - } - } - - public void setCounts(int indexCount, int peerCount, long pause) { - this.indexCount = indexCount; - if (indexCount < 30) indexCount = 30; - this.peerCount = peerCount; - this.pause = pause; - } - - } - - public int performTransferIndex(int indexCount, int peerCount, boolean delete) { - if ((yacyCore.seedDB == null) || (yacyCore.seedDB.sizeConnected() == 0)) return -1; - - // collect index - //String startPointHash = yacyCore.seedCache.mySeed.hash; - String startPointHash = serverCodings.encodeMD5B64("" + System.currentTimeMillis(), true).substring(0, yacySeedDB.commonHashLength); - plasmaWordIndexEntity[] indexEntities = selectTransferIndexes(startPointHash, indexCount); - if ((indexEntities == null) || (indexEntities.length == 0)) { - log.logDebug("No Index available for Index Transfer, hash start-point " + startPointHash); - return -1; - } - // count the indexes again, can be smaller as expected - indexCount = 0; for (int i = 0; i < indexEntities.length; i++) indexCount += indexEntities[i].size(); - - // find start point for DHT-selection - String keyhash = indexEntities[indexEntities.length - 1].wordHash(); - - // iterate over DHT-peers and send away the indexes - yacySeed seed; - int hc = 0; - Enumeration e = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(keyhash); - String error; - String peerNames = ""; - while ((e.hasMoreElements()) && (hc < peerCount)) { - seed = (yacySeed) e.nextElement(); - if (seed != null) { - error = yacyClient.transferIndex(seed, indexEntities, urlPool.loadedURL); - if (error == null) { - log.logInfo("Index Transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull"); - peerNames += ", " + seed.getName(); - hc++; - } else { - log.logWarning("Index Transfer to peer " + seed.getName() + ":" + seed.hash + " failed:'" + error + "', disconnecting peer"); - yacyCore.peerActions.peerDeparture(seed); - } - } - } - if (peerNames.length() > 0) peerNames = peerNames.substring(2); // remove comma - - // clean up and finish with deletion of indexes - if (hc >= peerCount) { - // success - if (delete) { - try { - if (deleteTransferIndexes(indexEntities)) { - log.logDebug("Deleted all transferred whole-word indexes locally"); - return indexCount; - } else { - log.logError("Deleted not all transferred whole-word indexes"); - return -1; - } - } catch (IOException ee) { - log.logError("Deletion of Indexes not possible:" + ee.getMessage()); - ee.printStackTrace(); - return -1; - } - } else { - // simply close the indexEntities - for (int i = 0; i < indexEntities.length; i++) try { - indexEntities[i].close(); - } catch (IOException ee) {} - } - return indexCount; - } else { - log.logError("Index distribution failed. Too less peers (" + hc + ") received the index, not deleted locally."); - return -1; - } - } - - private plasmaWordIndexEntity[] selectTransferIndexes(String hash, int count) { - Vector tmpEntities = new Vector(); - String nexthash = ""; - try { - Iterator wordHashIterator = wordIndex.wordHashes(hash, true, true); - plasmaWordIndexEntity indexEntity, tmpEntity; - Enumeration urlEnum; - plasmaWordIndexEntry indexEntry; - while ((count > 0) && (wordHashIterator.hasNext()) && - ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0)) { - indexEntity = wordIndex.getEntity(nexthash, true); - if (indexEntity.size() == 0) { - indexEntity.deleteComplete(); - } else if (indexEntity.size() <= count) { - // take the whole entity - tmpEntities.add(indexEntity); - log.logDebug("Selected Whole Index (" + indexEntity.size() + " urls) for word " + indexEntity.wordHash()); - count -= indexEntity.size(); - } else { - // make an on-the-fly entity and insert values - tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash()); - urlEnum = indexEntity.elements(true); - while ((urlEnum.hasMoreElements()) && (count > 0)) { - indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); - tmpEntity.addEntry(indexEntry); - count--; - } - urlEnum = null; - log.logDebug("Selected Partial Index (" + tmpEntity.size() + " from " + indexEntity.size() +" urls) for word " + tmpEntity.wordHash()); - tmpEntities.add(tmpEntity); - indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards - indexEntity = null; - } - - } - // transfer to array - plasmaWordIndexEntity[] indexEntities = new plasmaWordIndexEntity[tmpEntities.size()]; - for (int i = 0; i < tmpEntities.size(); i++) indexEntities[i] = (plasmaWordIndexEntity) tmpEntities.elementAt(i); - return indexEntities; - } catch (IOException e) { - log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage()); - e.printStackTrace(); - return new plasmaWordIndexEntity[0]; - } catch (kelondroException e) { - log.logError("selectTransferIndexes database corrupted: " + e.getMessage()); - e.printStackTrace(); - return new plasmaWordIndexEntity[0]; - } - } - - private boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException { - String wordhash; - Enumeration urlEnum; - plasmaWordIndexEntry indexEntry; - plasmaWordIndexEntity indexEntity; - String[] urlHashes; - int sz; - boolean success = true; - for (int i = 0; i < indexEntities.length; i++) { - if (indexEntities[i].isTMPEntity()) { - // delete entries separately - int c = 0; - urlHashes = new String[indexEntities[i].size()]; - urlEnum = indexEntities[i].elements(true); - while (urlEnum.hasMoreElements()) { - indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); - urlHashes[c++] = indexEntry.getUrlHash(); - } - wordIndex.removeEntries(indexEntities[i].wordHash(), urlHashes, true); - indexEntity = wordIndex.getEntity(indexEntities[i].wordHash(), true); - sz = indexEntity.size(); - indexEntity.close(); - log.logDebug("Deleted Partinal Index (" + c + " urls) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left"); - // DEBUG: now try to delete the remaining index. If this works, this routine is fine - /* - if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete()) - System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL"); - else - System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED"); - */ - // end debug - indexEntities[i].close(); - } else { - // delete complete file - if (indexEntities[i].deleteComplete()) { - indexEntities[i].close(); - } else { - indexEntities[i].close(); - // have another try... - if (!(plasmaWordIndexEntity.wordHash2path(plasmaPath, indexEntities[i].wordHash()).delete())) { - success = false; - log.logError("Could not delete whole Index for word " + indexEntities[i].wordHash()); - } - } - } - indexEntities[i] = null; - } - return success; - } public int adminAuthenticated(httpHeader header) { String adminAccountBase64MD5 = getConfig("adminAccountBase64MD5", ""); diff --git a/source/de/anomic/plasma/plasmaURLPattern.java b/source/de/anomic/plasma/plasmaURLPattern.java index 3fd3b2c9e..16cd37774 100644 --- a/source/de/anomic/plasma/plasmaURLPattern.java +++ b/source/de/anomic/plasma/plasmaURLPattern.java @@ -45,7 +45,7 @@ import java.lang.String; import java.util.HashMap; import java.io.File; -import de.anomic.server.serverFileUtils; +import de.anomic.kelondro.kelondroMSetTools; public class plasmaURLPattern { @@ -71,7 +71,7 @@ public class plasmaURLPattern { if(filenamesarray.length >0) for(int i = 0; i < filenamesarray.length; i++) - hostpaths.putAll(serverFileUtils.loadMap(mapname, (new File(rootPath, filenamesarray[i])).toString(), sep)); + hostpaths.putAll(kelondroMSetTools.loadMap(mapname, (new File(rootPath, filenamesarray[i])).toString(), sep)); } public void remove(String host) { diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index 93e3318e0..7726919b2 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -44,8 +44,11 @@ package de.anomic.plasma; +import java.net.URL; import java.io.File; import java.io.IOException; +import java.util.Set; +import java.util.Iterator; public class plasmaURLPool { @@ -60,12 +63,24 @@ public class plasmaURLPool { errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL); } - public String testHash(String hash) { + public String exists(String hash) { // tests if hash occurrs in any database // if it exists, the name of the database is returned, // if it not exists, null is returned if (loadedURL.exists(hash)) return "loaded"; if (noticeURL.existsInStack(hash)) return "crawler"; + if (errorURL.exists(hash)) return "errors"; + return null; + } + + public URL getURL(String urlhash) { + if (urlhash.equals(plasmaURL.dummyHash)) return null; + plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); + if (ne != null) return ne.url(); + plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash); + if (le != null) return le.url(); + plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); + if (ee != null) return ee.url(); return null; } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 222160d52..1da846107 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -67,6 +67,10 @@ public final class plasmaWordIndex { this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, bufferkb, log); } + public File getRoot() { + return databaseRoot; + } + public int maxURLinWordCache() { return ramCache.maxURLinWordCache(); } diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java new file mode 100644 index 000000000..31202c413 --- /dev/null +++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java @@ -0,0 +1,254 @@ + + +package de.anomic.plasma; + +import java.io.IOException; +import java.util.Enumeration; +import java.util.Vector; +import java.util.Iterator; + +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacySeed; +import de.anomic.yacy.yacySeedDB; +import de.anomic.yacy.yacyClient; +import de.anomic.server.serverCodings; +import de.anomic.server.logging.serverLog; +import de.anomic.kelondro.kelondroException; + +public class plasmaWordIndexDistribution { + + // distributes parts of the index to other peers + // stops as soon as an error occurrs + + private int indexCount; + private int peerCount; + private long maxTime; + + private plasmaURLPool urlPool; + private plasmaWordIndex wordIndex; + private serverLog log; + private boolean enabled; + + public plasmaWordIndexDistribution(plasmaURLPool urlPool, plasmaWordIndex wordIndex, serverLog log, + boolean enable) { + this.urlPool = urlPool; + this.wordIndex = wordIndex; + setCounts(100 /*indexCount*/, 1 /*peerCount*/, 8000); + } + + public void enable() { + enabled = true; + } + + public void disable() { + enabled = false; + } + + public boolean job() { + if ((yacyCore.seedDB == null) || + (yacyCore.seedDB.mySeed == null) || + (yacyCore.seedDB.mySeed.isVirgin()) || + (urlPool.loadedURL.size() < 10) || + (wordIndex.size() < 100) || + (!(yacyCore.seedDB.mySeed.isJunior()))) return false; + + int transferred; + long starttime = System.currentTimeMillis(); + try { + if ( + (urlPool.noticeURL.stackSize() == 0) && + (enabled) && + ((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) { + indexCount = transferred; + if ((System.currentTimeMillis() - starttime) > (maxTime * peerCount)) indexCount--; else indexCount++; + if (indexCount < 30) indexCount = 30; + return true; + } else { + // make a long pause + return false; + } + } catch (IllegalArgumentException ee) { + // this is a bug that occurres if a not-fixeable data-inconsistency in the table structure was detected + // make a long pause + log.logError("very bad data inconsistency: " + ee.getMessage()); + //ee.printStackTrace(); + return false; + } + } + + public void setCounts(int indexCount, int peerCount, long maxTimePerTransfer) { + this.maxTime = maxTimePerTransfer; + this.indexCount = indexCount; + if (indexCount < 30) indexCount = 30; + this.peerCount = peerCount; + } + + public int performTransferIndex(int indexCount, int peerCount, boolean delete) { + if ((yacyCore.seedDB == null) || (yacyCore.seedDB.sizeConnected() == 0)) return -1; + + // collect index + //String startPointHash = yacyCore.seedCache.mySeed.hash; + String startPointHash = serverCodings.encodeMD5B64("" + System.currentTimeMillis(), true).substring(0, yacySeedDB.commonHashLength); + plasmaWordIndexEntity[] indexEntities = selectTransferIndexes(startPointHash, indexCount); + if ((indexEntities == null) || (indexEntities.length == 0)) { + log.logDebug("No Index available for Index Transfer, hash start-point " + startPointHash); + return -1; + } + // count the indexes again, can be smaller as expected + indexCount = 0; for (int i = 0; i < indexEntities.length; i++) indexCount += indexEntities[i].size(); + + // find start point for DHT-selection + String keyhash = indexEntities[indexEntities.length - 1].wordHash(); + + // iterate over DHT-peers and send away the indexes + yacySeed seed; + int hc = 0; + Enumeration e = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(keyhash); + String error; + String peerNames = ""; + while ((e.hasMoreElements()) && (hc < peerCount)) { + seed = (yacySeed) e.nextElement(); + if (seed != null) { + error = yacyClient.transferIndex(seed, indexEntities, urlPool.loadedURL); + if (error == null) { + log.logInfo("Index Transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull"); + peerNames += ", " + seed.getName(); + hc++; + } else { + log.logWarning("Index Transfer to peer " + seed.getName() + ":" + seed.hash + " failed:'" + error + "', disconnecting peer"); + yacyCore.peerActions.peerDeparture(seed); + } + } + } + if (peerNames.length() > 0) peerNames = peerNames.substring(2); // remove comma + + // clean up and finish with deletion of indexes + if (hc >= peerCount) { + // success + if (delete) { + try { + if (deleteTransferIndexes(indexEntities)) { + log.logDebug("Deleted all transferred whole-word indexes locally"); + return indexCount; + } else { + log.logError("Deleted not all transferred whole-word indexes"); + return -1; + } + } catch (IOException ee) { + log.logError("Deletion of Indexes not possible:" + ee.getMessage()); + ee.printStackTrace(); + return -1; + } + } else { + // simply close the indexEntities + for (int i = 0; i < indexEntities.length; i++) try { + indexEntities[i].close(); + } catch (IOException ee) {} + } + return indexCount; + } else { + log.logError("Index distribution failed. Too less peers (" + hc + ") received the index, not deleted locally."); + return -1; + } + } + + private plasmaWordIndexEntity[] selectTransferIndexes(String hash, int count) { + Vector tmpEntities = new Vector(); + String nexthash = ""; + try { + Iterator wordHashIterator = wordIndex.wordHashes(hash, true, true); + plasmaWordIndexEntity indexEntity, tmpEntity; + Enumeration urlEnum; + plasmaWordIndexEntry indexEntry; + while ((count > 0) && (wordHashIterator.hasNext()) && + ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0)) { + indexEntity = wordIndex.getEntity(nexthash, true); + if (indexEntity.size() == 0) { + indexEntity.deleteComplete(); + } else if (indexEntity.size() <= count) { + // take the whole entity + tmpEntities.add(indexEntity); + log.logDebug("Selected Whole Index (" + indexEntity.size() + " urls) for word " + indexEntity.wordHash()); + count -= indexEntity.size(); + } else { + // make an on-the-fly entity and insert values + tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash()); + urlEnum = indexEntity.elements(true); + while ((urlEnum.hasMoreElements()) && (count > 0)) { + indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); + tmpEntity.addEntry(indexEntry); + count--; + } + urlEnum = null; + log.logDebug("Selected Partial Index (" + tmpEntity.size() + " from " + indexEntity.size() +" urls) for word " + tmpEntity.wordHash()); + tmpEntities.add(tmpEntity); + indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards + indexEntity = null; + } + + } + // transfer to array + plasmaWordIndexEntity[] indexEntities = new plasmaWordIndexEntity[tmpEntities.size()]; + for (int i = 0; i < tmpEntities.size(); i++) indexEntities[i] = (plasmaWordIndexEntity) tmpEntities.elementAt(i); + return indexEntities; + } catch (IOException e) { + log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage()); + e.printStackTrace(); + return new plasmaWordIndexEntity[0]; + } catch (kelondroException e) { + log.logError("selectTransferIndexes database corrupted: " + e.getMessage()); + e.printStackTrace(); + return new plasmaWordIndexEntity[0]; + } + } + + private boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException { + String wordhash; + Enumeration urlEnum; + plasmaWordIndexEntry indexEntry; + plasmaWordIndexEntity indexEntity; + String[] urlHashes; + int sz; + boolean success = true; + for (int i = 0; i < indexEntities.length; i++) { + if (indexEntities[i].isTMPEntity()) { + // delete entries separately + int c = 0; + urlHashes = new String[indexEntities[i].size()]; + urlEnum = indexEntities[i].elements(true); + while (urlEnum.hasMoreElements()) { + indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); + urlHashes[c++] = indexEntry.getUrlHash(); + } + wordIndex.removeEntries(indexEntities[i].wordHash(), urlHashes, true); + indexEntity = wordIndex.getEntity(indexEntities[i].wordHash(), true); + sz = indexEntity.size(); + indexEntity.close(); + log.logDebug("Deleted Partinal Index (" + c + " urls) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left"); + // DEBUG: now try to delete the remaining index. If this works, this routine is fine + /* + if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete()) + System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL"); + else + System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED"); + */ + // end debug + indexEntities[i].close(); + } else { + // delete complete file + if (indexEntities[i].deleteComplete()) { + indexEntities[i].close(); + } else { + indexEntities[i].close(); + // have another try... + if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot() /*PLASMADB*/, indexEntities[i].wordHash()).delete())) { + success = false; + log.logError("Could not delete whole Index for word " + indexEntities[i].wordHash()); + } + } + } + indexEntities[i] = null; + } + return success; + } +} diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 85cb50f13..84d84ea66 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -51,7 +51,6 @@ import java.io.OutputStream; import java.io.BufferedReader; import java.io.InputStreamReader; import java.util.zip.GZIPOutputStream; -import java.util.TreeMap; import java.util.HashSet; public final class serverFileUtils { @@ -163,24 +162,5 @@ public final class serverFileUtils { } return set; } - - public static TreeMap loadMap(String mapname, String filename, String sep) { - TreeMap map = new TreeMap(); - BufferedReader br = null; - try { - br = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); - String line; - int pos; - while ((line = br.readLine()) != null) { - line = line.trim(); - if ((line.length() > 0) && (!(line.startsWith("#"))) && ((pos = line.indexOf(sep)) > 0)) - map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim()); - } - } catch (IOException e) { - } finally { - if (br != null) try { br.close(); } catch (Exception e) {} - } - return map; - } - + } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 17fd52a10..dfc8e29ff 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -450,7 +450,7 @@ public class yacyClient { } } - public static HashMap crawlOrder(yacySeed targetSeed, String url, String referrer, int depth) { + public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer, int depth) { // this post a message to the remote message board if (targetSeed == null) return null; if (yacyCore.seedDB.mySeed == null) return null; @@ -467,8 +467,8 @@ public class yacyClient { "&process=crawl" + "&youare=" + targetSeed.hash + "&iam=" + yacyCore.seedDB.mySeed.hash + - "&url=" + crypt.simpleEncode(url) + - "&referrer=" + crypt.simpleEncode(referrer) + + "&url=" + crypt.simpleEncode(url.toString()) + + "&referrer=" + crypt.simpleEncode((referrer == null) ? "" : referrer.toString()) + "&depth=" + depth + "&ttl=0" ),