From 0cd0fee546e175775690448e7d53710749c319bc Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 16 Sep 2008 21:56:23 +0000 Subject: [PATCH] fixed bug with wrong proxy result enqueueing. See: http://forum.yacy-websuche.de/viewtopic.php?p=8130#p8130 - removed the online status property. This influenced the proxy behavior and created some complexity that was not needed because the online status was never used for the purpose it was created for (offline browsing) - checked all proxy identification procedures during crawling and enhanced transparency and error checking - fixed a proxy identification routine that caused the wrong selection of the proxy result queue git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5173 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 7 --- htroot/Network.html | 15 ------ htroot/Network.java | 10 ---- htroot/SettingsAck_p.java | 22 -------- htroot/Status.java | 8 --- source/de/anomic/crawler/CrawlStacker.java | 32 ++++++++--- source/de/anomic/crawler/IndexingStack.java | 6 +-- .../anomic/http/JakartaCommonsHttpClient.java | 4 ++ .../de/anomic/http/httpdProxyCacheEntry.java | 7 +++ source/de/anomic/http/httpdProxyHandler.java | 54 +++++++++---------- .../anomic/plasma/parser/rss/rssParser.java | 2 + source/de/anomic/plasma/plasmaHTCache.java | 2 - source/de/anomic/yacy/yacyCore.java | 31 ----------- 13 files changed, 65 insertions(+), 135 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index cb938392b..62245f709 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -431,13 +431,6 @@ peerName=anomic # of the period here (minutes) peerCycle=2 -# The p2p maintenance can run in either of two online modes: -# - don't process jobs and only access available in cache -> mode 0 -# - process any job only if we are online, which is technically only the case -# if the proxy is used -> mode 1 -# - process jobs periodically, with periodes according to peerCycle -> mode 2 -onlineMode=2 - # Debug mode for YACY network: 
this will trigger that also local ip's are # accepted as peer addresses yacyDebugMode=false diff --git a/htroot/Network.html b/htroot/Network.html index feec221ef..9d19518b2 100644 --- a/htroot/Network.html +++ b/htroot/Network.html @@ -165,21 +165,6 @@
- #(comment)# - :: - - :: - - #(/comment)#
diff --git a/htroot/Network.java b/htroot/Network.java index d00bf4a77..9ec5b46f1 100644 --- a/htroot/Network.java +++ b/htroot/Network.java @@ -164,16 +164,6 @@ public class Network { prop.putNum("table_gppm", otherppm + ((iAmActive) ? myppm : 0)); prop.putNum("table_gqph", Math.round(6000d * otherqpm + 100d * ((iAmActive) ? myqph : 0d)) / 100d); - -// String comment = ""; - prop.put("table_comment", 0); - if (conCount == 0) { - if (Integer.parseInt(sb.getConfig("onlineMode", "1")) == 2) { - prop.put("table_comment", 1);//in onlinemode, but not online - } else { - prop.put("table_comment", 2);//not in online mode, and not online - } - } prop.put("table", 2); // triggers overview prop.put("page", 0); } else if (post != null && Integer.parseInt(post.get("page", "1")) == 4) { diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 357ba3a68..76fc88ecb 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -288,28 +288,6 @@ public class SettingsAck_p { prop.putHTML("info_filter", filter); return prop; } - - if (post.containsKey("pmode")) { - env.setConfig("onlineMode", "2"); - prop.put("info", "11");//permanent online mode - yacyCore.setOnlineMode(2); - yacyCore.triggerOnlineAction(); - return prop; - } - - if (post.containsKey("emode")) { - env.setConfig("onlineMode", "1"); - prop.put("info", "24");//event-based online mode - yacyCore.setOnlineMode(1); - return prop; - } - - if (post.containsKey("cmode")) { - env.setConfig("onlineMode", "0"); - prop.put("info", "25");//cache mode - yacyCore.setOnlineMode(0); - return prop; - } if (post.containsKey("proxysettings")) { diff --git a/htroot/Status.java b/htroot/Status.java index e7bf09338..4582f6816 100644 --- a/htroot/Status.java +++ b/htroot/Status.java @@ -270,14 +270,6 @@ public class Status { prop.put("tray", "1"); } - if (sb.getConfig("onlineMode", "1").equals("0")) { - prop.put("omode", "0"); - } else if (sb.getConfig("onlineMode", "1").equals("1")) { - prop.put("omode", "1"); - 
} else { - prop.put("omode", "2"); - } - // memory usage and system attributes prop.put("freeMemory", serverMemory.bytesToString(serverMemory.free())); prop.put("totalMemory", serverMemory.bytesToString(serverMemory.total())); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 9dcc71b33..1fbf60eb8 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -476,6 +476,8 @@ public final class CrawlStacker extends Thread { // store information final boolean local = entry.initiator().equals(sb.webIndex.seedDB.mySeed().hash); + final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(this.sb.webIndex.defaultProxyProfile.handle()); + final boolean remote = profile.handle().equals(this.sb.webIndex.defaultRemoteProfile.handle()); final boolean global = (profile.remoteIndexing()) /* granted */ && (entry.depth() == profile.generalDepth()) /* leaf node */ && @@ -485,15 +487,29 @@ public final class CrawlStacker extends Thread { (sb.webIndex.seedDB.mySeed().isPrincipal()) ) /* qualified */; - if (!local && !global && !profile.handle().equals(this.sb.webIndex.defaultRemoteProfile.handle())) { - this.log.logSevere("URL '" + entry.url().toString() + "' can neither be crawled local nor global."); + if (!local && !global && !remote && !proxy) { + this.log.logSevere("URL '" + entry.url().toString() + "' cannot be crawled. 
initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); + } else { + if (global) { + // it may be possible that global == true and local == true, so do not check an error case against it + if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); + if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); + sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry); + } + if (local) { + if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); + if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); + sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); + } + if (proxy) { + if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle()); + sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); + } + if (remote) { + sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry); + } + } - - // add the url into the crawling queue - sb.crawlQueues.noticeURL.push( - ((global) ? NoticedURL.STACK_TYPE_LIMIT : - ((local) ? 
NoticedURL.STACK_TYPE_CORE : NoticedURL.STACK_TYPE_REMOTE)) /*local/remote stack*/, - entry); return null; } diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index e22b89b9f..66c7bd634 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -318,7 +318,7 @@ public class IndexingStack { } public String initiator() { - return (initiator == null) ? "------------" : initiator; + return (initiator == null) ? "------------" : initiator; // TODO: this may cause problems for methods that check if the initiator is the proxy } public yacySeed initiatorPeer() { @@ -396,10 +396,10 @@ public class IndexingStack { // 6) local fetching for global crawling (other known or unknwon initiator) int processCase = plasmaSwitchboardConstants.PROCESSCASE_0_UNKNOWN; // FIXME the equals seems to be incorrect: String.equals(boolean) - if ((initiator == null) || (initiator.equals(initiator.length() == 0))) { + if ((initiator == null) || initiator.length() == 0 || initiator.equals("------------")) { // proxy-load processCase = plasmaSwitchboardConstants.PROCESSCASE_4_PROXY_LOAD; - } else if ((initiator != null) && (initiator.equals(wordIndex.seedDB.mySeed().hash))) { + } else if (initiator.equals(wordIndex.seedDB.mySeed().hash)) { // normal crawling processCase = plasmaSwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING; } else { diff --git a/source/de/anomic/http/JakartaCommonsHttpClient.java b/source/de/anomic/http/JakartaCommonsHttpClient.java index bfeb96b5e..e5374d685 100644 --- a/source/de/anomic/http/JakartaCommonsHttpClient.java +++ b/source/de/anomic/http/JakartaCommonsHttpClient.java @@ -446,6 +446,10 @@ public class JakartaCommonsHttpClient { // cleanUp statistics HttpConnectionInfo.removeConnection(generateConInfo(method)); throw e; + } catch (final IllegalStateException e) { + // cleanUp statistics + HttpConnectionInfo.removeConnection(generateConInfo(method)); + throw new 
IOException(e.getMessage()); } if (serverLog.isFinest("HTTPC")) serverLog.logFinest("HTTPC", "<-" + method.hashCode() + " response headers " + Arrays.toString(method.getResponseHeaders())); diff --git a/source/de/anomic/http/httpdProxyCacheEntry.java b/source/de/anomic/http/httpdProxyCacheEntry.java index 190029d6f..8a8edd744 100755 --- a/source/de/anomic/http/httpdProxyCacheEntry.java +++ b/source/de/anomic/http/httpdProxyCacheEntry.java @@ -158,6 +158,13 @@ public class httpdProxyCacheEntry implements indexDocumentMetadata { this.depth = depth; this.responseStatus = responseStatus; this.profile = profile; + + // the initiator is the hash of the peer that caused the hash entry + // it is stored here only to track processed in the peer and this + // information is not permanently stored in the web index after the queue has + // been processed + // in case of proxy usage, the initiator hash is null, + // which distinguishes local crawling from proxy indexing this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator); this.language = yacyURL.language(url); diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 4a9d1d4f7..0146ac7bb 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -93,7 +93,6 @@ import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverMiniLogFormatter; -import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyURL; public final class httpdProxyHandler { @@ -415,35 +414,32 @@ public final class httpdProxyHandler { // 4. cache stale - refill - superfluous // in two of these cases we trigger a scheduler to handle newly arrived files: // case 1 and case 3 - final indexDocumentMetadata cacheEntry = (cachedResponseHeader == null) ? 
null : - new httpdProxyCacheEntry( - 0, // crawling depth - url, // url - "", // name of the url is unknown - //requestHeader, // request headers - "200 OK", // request status - requestHeader, - cachedResponseHeader, - null, // initiator - switchboard.webIndex.defaultProxyProfile // profile - ); - if (cacheEntry != null) plasmaHTCache.storeMetadata(cachedResponseHeader, cacheEntry); - - if (yacyCore.getOnlineMode() == 0) { - if (cacheExists) { - if (theLogger.isFinest()) theLogger.logFinest(reqID +" fulfill request from cache"); - fulfillRequestFromCache(conProp,url,ext,requestHeader,cachedResponseHeader,cacheFile,countedRespond); - } else { - theLogger.logInfo("URL not availabe in Cache"+" and not in online-mode!"); - httpd.sendRespondError(conProp,countedRespond,4,404,null,"URL not availabe in Cache",null); - } - } else if (cacheExists && cacheEntry.shallUseCacheForProxy()) { - if (theLogger.isFinest()) theLogger.logFinest(reqID +" fulfill request from cache"); - fulfillRequestFromCache(conProp,url,ext,requestHeader,cachedResponseHeader,cacheFile,countedRespond); - } else { - if (theLogger.isFinest()) theLogger.logFinest(reqID +" fulfill request from web"); - fulfillRequestFromWeb(conProp,url,ext,requestHeader,cachedResponseHeader,cacheFile,countedRespond); + if (cachedResponseHeader == null) { + if (theLogger.isFinest()) theLogger.logFinest(reqID + " page not in cache: fulfill request from web"); + fulfillRequestFromWeb(conProp,url,ext,requestHeader,cachedResponseHeader,cacheFile,countedRespond); + } else { + final indexDocumentMetadata cacheEntry = new httpdProxyCacheEntry( + 0, // crawling depth + url, // url + "", // name of the url is unknown + //requestHeader, // request headers + "200 OK", // request status + requestHeader, + cachedResponseHeader, + null, // initiator + switchboard.webIndex.defaultProxyProfile // profile + ); + plasmaHTCache.storeMetadata(cachedResponseHeader, cacheEntry); // TODO: check if this storeMetadata is necessary + + if 
(cacheExists && cacheEntry.shallUseCacheForProxy()) { + if (theLogger.isFinest()) theLogger.logFinest(reqID + " fulfill request from cache"); + fulfillRequestFromCache(conProp,url,ext,requestHeader,cachedResponseHeader,cacheFile,countedRespond); + } else { + if (theLogger.isFinest()) theLogger.logFinest(reqID + " fulfill request from web"); + fulfillRequestFromWeb(conProp,url,ext,requestHeader,cachedResponseHeader,cacheFile,countedRespond); + } } + } catch (final Exception e) { try { diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index e81a99ac8..a6df6224b 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -85,6 +85,8 @@ public class rssParser extends AbstractParser implements Parser { final serverCharBuffer authors = new serverCharBuffer(); final RSSFeed feed = new RSSReader(source).getFeed(); + if (feed == null) throw new ParserException("no feed in document",location); + if (feed.getChannel() == null) throw new ParserException("no channel in document",location); // getting the rss feed title and description final String feedTitle = feed.getChannel().getTitle(); diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index cdb61a53e..2a6d201fd 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -699,7 +699,6 @@ public final class plasmaHTCache { * ACCESS METHODS */ - // Store to Cache public static void storeMetadata( @@ -712,7 +711,6 @@ public final class plasmaHTCache { hm.putAll(responseHeader); hm.put("@@URL", metadata.url().toNormalform(false, false)); hm.put("@@DEPTH", Integer.toString(metadata.depth())); - if (metadata.initiator() != null) hm.put("@@INITIATOR", metadata.initiator()); responseHeaderDB.put(metadata.urlHash(), hm); } catch (final Exception e) { log.logWarning("could not write ResourceInfo: " diff --git 
a/source/de/anomic/yacy/yacyCore.java b/source/de/anomic/yacy/yacyCore.java index 11bf9c55e..f38bbc9ea 100644 --- a/source/de/anomic/yacy/yacyCore.java +++ b/source/de/anomic/yacy/yacyCore.java @@ -81,7 +81,6 @@ public class yacyCore { // public static boolean terminate = false; // class variables - private static int onlineMode = 1; plasmaSwitchboard sb; public static int yacyTime() { @@ -115,41 +114,12 @@ public class yacyCore { // ATTENTION, VERY IMPORTANT: before starting the thread, the httpd yacy server must be running! speedKey = System.currentTimeMillis() - time; - - // start with a seedList update to propagate out peer, if possible - onlineMode = Integer.parseInt(sb.getConfig("onlineMode", "1")); - //lastSeedUpdate = universalTime(); lastOnlineTime = 0; - - // cycle - // within cycle: update seed file, strengthen network, pass news (new, old seed's) - if (online()) { - log.logConfig("you are in online mode"); - } else { - log.logConfig("YOU ARE OFFLINE! ---"); - log.logConfig("--- TO START BOOTSTRAPING, YOU MUST USE THE PROXY,"); - log.logConfig("--- OR HIT THE BUTTON 'go online'"); - log.logConfig("--- ON THE STATUS PAGE http://localhost:" + serverCore.getPortNr(sb.getConfig("port", "8080")) + "/Status.html"); - } } synchronized static public void triggerOnlineAction() { lastOnlineTime = System.currentTimeMillis(); } - - public final boolean online() { - onlineMode = Integer.parseInt(sb.getConfig("onlineMode", "1")); - return ((onlineMode == 2) || ((System.currentTimeMillis() - lastOnlineTime) < 10000)); - } - - public static int getOnlineMode() { - return onlineMode; - } - - public static void setOnlineMode(final int newOnlineMode) { - onlineMode = newOnlineMode; - return; - } public final void publishSeedList() { if (log.isFine()) log.logFine("yacyCore.publishSeedList: Triggered Seed Publish"); @@ -201,7 +171,6 @@ public class yacyCore { } public final void peerPing() { - if (!online()) { return; } if ((sb.isRobinsonMode()) && 
(sb.getConfig("cluster.mode", "").equals("privatepeer"))) { // in case this peer is a privat peer we omit the peer ping // all other robinson peer types do a peer ping: