From 13c63f40826c8952f447f70a5395dfdf8ea0a690 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 15 Jul 2009 14:15:51 +0000 Subject: [PATCH] a set of small fixes to crawling behaviour git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6216 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreateIndexingQueue_p.java | 16 +-- htroot/Status.java | 2 +- htroot/api/queues_p.java | 10 +- source/de/anomic/crawler/Balancer.java | 6 +- source/de/anomic/crawler/CrawlQueues.java | 18 +-- .../de/anomic/crawler/CrawlSwitchboard.java | 10 +- source/de/anomic/crawler/IndexingStack.java | 6 +- source/de/anomic/crawler/Latency.java | 15 ++- source/de/anomic/crawler/NoticedURL.java | 4 +- source/de/anomic/document/Parser.java | 12 +- .../de/anomic/plasma/plasmaSwitchboard.java | 118 ++++++++++-------- source/de/anomic/yacy/yacyURL.java | 1 + source/yacy.java | 2 +- 13 files changed, 118 insertions(+), 102 deletions(-) diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index 7a12b48e5..e587af77d 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -63,20 +63,20 @@ public class IndexCreateIndexingQueue_p { } if (post.containsKey("clearIndexingQueue")) { try { - synchronized (sb.crawler.queuePreStack) { + synchronized (sb.crawler.indexingStack) { IndexingStack.QueueEntry entry = null; - while ((entry = sb.crawler.queuePreStack.pop()) != null) { + while ((entry = sb.crawler.indexingStack.pop()) != null) { if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) { plasmaHTCache.deleteFromCache(entry.url()); } } - sb.crawler.queuePreStack.clear(); // reset file to clean up content completely + sb.crawler.indexingStack.clear(); // reset file to clean up content completely } } catch (final Exception e) {} } else if (post.containsKey("deleteEntry")) { final String urlHash = post.get("deleteEntry"); try { - sb.crawler.queuePreStack.remove(urlHash); + sb.crawler.indexingStack.remove(urlHash); } catch (final Exception e) {} prop.put("LOCATION",""); return prop; @@ -86,7 +86,7 @@ public class IndexCreateIndexingQueue_p { yacySeed initiator; boolean dark; - if ((sb.crawler.queuePreStack.size() == 0) && (sb.crawler.queuePreStack.getActiveQueueSize() == 0)) { + if ((sb.crawler.indexingStack.size() == 0) && (sb.crawler.indexingStack.getActiveQueueSize() == 0)) { prop.put("indexing-queue", "0"); //is empty } else { prop.put("indexing-queue", "1"); // there are entries in the queue or in process @@ -98,12 +98,12 @@ public class IndexCreateIndexingQueue_p { // getting all entries that are currently in process final ArrayList entryList = new ArrayList(); - entryList.addAll(sb.crawler.queuePreStack.getActiveQueueEntries()); + entryList.addAll(sb.crawler.indexingStack.getActiveQueueEntries()); final int inProcessCount = entryList.size(); // getting all enqueued entries - if ((sb.crawler.queuePreStack.size() > 0)) { - final Iterator i = sb.crawler.queuePreStack.entryIterator(false); + if ((sb.crawler.indexingStack.size() > 0)) { + final Iterator i = sb.crawler.indexingStack.entryIterator(false); while (i.hasNext()) entryList.add(i.next()); } diff --git a/htroot/Status.java b/htroot/Status.java index 018cd162a..7d360261f 100644 --- a/htroot/Status.java +++ b/htroot/Status.java @@ -286,7 +286,7 @@ public class Status { prop.putNum("connectionsMax", httpd.getMaxSessionCount()); // Queue information - final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.crawler.queuePreStack.getActiveQueueSize(); + final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.crawler.indexingStack.getActiveQueueSize(); final int indexingMaxCount = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30); final int indexingPercent = (indexingMaxCount==0)?0:indexingJobCount*100/indexingMaxCount; prop.putNum("indexingQueueSize", indexingJobCount); diff --git a/htroot/api/queues_p.java b/htroot/api/queues_p.java index 318a1560a..a9506b9c7 100755 --- a/htroot/api/queues_p.java +++ b/htroot/api/queues_p.java @@ -39,11 +39,11 @@ public class queues_p { yacySeed initiator; //indexing queue - prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.crawler.queuePreStack.getActiveQueueSize()); + prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.crawler.indexingStack.getActiveQueueSize()); prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30)); prop.putNum("urlpublictextSize", sb.indexSegment.urlMetadata().size()); prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().sizesMax()); - if ((sb.crawler.queuePreStack.size() == 0) && (sb.crawler.queuePreStack.getActiveQueueSize() == 0)) { + if ((sb.crawler.indexingStack.size() == 0) && (sb.crawler.indexingStack.getActiveQueueSize() == 0)) { prop.put("list", "0"); //is empty } else { IndexingStack.QueueEntry pcentry; @@ -52,12 +52,12 @@ public class queues_p { // getting all entries that are currently in process final ArrayList entryList = new ArrayList(); - entryList.addAll(sb.crawler.queuePreStack.getActiveQueueEntries()); + entryList.addAll(sb.crawler.indexingStack.getActiveQueueEntries()); final int inProcessCount = entryList.size(); // getting all enqueued entries - if ((sb.crawler.queuePreStack.size() > 0)) { - final Iterator i1 = sb.crawler.queuePreStack.entryIterator(false); + if ((sb.crawler.indexingStack.size() > 0)) { + final Iterator i1 = sb.crawler.indexingStack.entryIterator(false); while (i1.hasNext()) try { entryList.add(i1.next()); } catch (kelondroException e) { diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 660531c76..60d7d126d 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -95,7 +95,7 @@ public class Balancer { } public void clear() { - Log.logInfo("Balancer", "cleaing balancer with " + urlFileIndex.size() + " entries from " + urlFileIndex.filename()); + Log.logInfo("Balancer", "cleaning balancer with " + urlFileIndex.size() + " entries from " + urlFileIndex.filename()); try { urlFileIndex.clear(); } catch (IOException e) { @@ -289,7 +289,7 @@ public class Balancer { * @return a url in a CrawlEntry object * @throws IOException */ - public CrawlEntry pop(boolean delay, CrawlProfile profile) throws IOException { + public CrawlEntry pop(final boolean delay, final CrawlProfile profile) throws IOException { // returns a crawl entry from the stack and ensures minimum delta times filltop(delay, -600000, false); @@ -355,7 +355,7 @@ public class Balancer { // in best case, this should never happen if the balancer works propertly // this is only to protection against the worst case, where the crawler could // behave in a DoS-manner - Log.logInfo("BALANCER", "forcing crawl-delay of " + (sleeptime / 1000) + " seconds for " + crawlEntry.url().getHost() + ((sleeptime > Math.max(minimumLocalDelta, minimumGlobalDelta)) ? " (forced latency)" : "")); + Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ((sleeptime > Math.max(minimumLocalDelta, minimumGlobalDelta)) ? " (forced latency)" : "")); long loops = sleeptime / 3000; long rest = sleeptime % 3000; if (loops < 2) { diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index bedbd8bbc..c9e091c38 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -303,9 +303,9 @@ public class CrawlQueues { } value = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30); - if (sb.crawler.queuePreStack.size() >= value) { + if (sb.crawler.indexingStack.size() >= value) { if (this.log.isFine()) { - log.logFine(type + "Crawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.crawler.queuePreStack.size() + ")"); + log.logFine(type + "Crawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.crawler.indexingStack.size() + ")"); } return false; } @@ -322,9 +322,10 @@ public class CrawlQueues { return false; } - if (sb.onlineCaution()) { + String cautionCause = sb.onlineCaution(); + if (cautionCause != null) { if (this.log.isFine()) { - log.logFine(type + "Crawl: online caution, omitting processing"); + log.logFine(type + "Crawl: online caution for " + cautionCause + ", omitting processing"); } return false; } @@ -344,8 +345,8 @@ public class CrawlQueues { return false; } - if (sb.crawler.queuePreStack.size() >= (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30) / 2) { - if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.crawler.queuePreStack.size() + ")"); + if (sb.crawler.indexingStack.size() >= (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30) / 2) { + if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.crawler.indexingStack.size() + ")"); return false; } @@ -359,8 +360,9 @@ public class CrawlQueues { return false; } - if (sb.onlineCaution()) { - if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: online caution, omitting processing"); + String cautionCause = sb.onlineCaution(); + if (cautionCause != null) { + if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: online caution for " + cautionCause + ", omitting processing"); return false; } diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 12f5bf87c..eb5dbda08 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -58,7 +58,7 @@ public final class CrawlSwitchboard { public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L; private final Log log; - public IndexingStack queuePreStack; + public IndexingStack indexingStack; public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls; public CrawlProfile.entry defaultProxyProfile; public CrawlProfile.entry defaultRemoteProfile; @@ -134,11 +134,11 @@ public final class CrawlSwitchboard { ", " + profilesPassiveFile.length()/1024); // init queues - this.queuePreStack = new IndexingStack(peers, queuesRoot, "urlNoticePreStack.stack", this.profilesActiveCrawls); + this.indexingStack = new IndexingStack(peers, queuesRoot, "urlNoticePreStack.stack", this.profilesActiveCrawls); } public void clear() { - queuePreStack.clear(); + indexingStack.clear(); } private void initActiveCrawlProfiles() { @@ -230,7 +230,7 @@ public final class CrawlSwitchboard { public boolean cleanProfiles() throws InterruptedException { - if (queuePreStack.size() > 0) return false; + if (indexingStack.size() > 0) return false; final Iterator iter = profilesActiveCrawls.profiles(true); CrawlProfile.entry entry; boolean hasDoneSomething = false; @@ -264,7 +264,7 @@ public final class CrawlSwitchboard { public void close() { this.profilesActiveCrawls.close(); this.profilesPassiveCrawls.close(); - this.queuePreStack.close(); + this.indexingStack.close(); } } diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index 969dec3cb..b27aef6e1 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -204,7 +204,7 @@ public class IndexingStack { return new QueueEntry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, anchorName); } - public void enQueueToActive(final QueueEntry entry) { + public void store(final QueueEntry entry) { queueInProcess.put(entry.url().hash(), entry); } @@ -437,7 +437,7 @@ public class IndexingStack { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "Indexing_Not_Allowed"; + return "indexing not allowed - indexText and indexMedia not set (for proxy)"; } // -CGI access in request @@ -579,7 +579,7 @@ public class IndexingStack { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "Indexing_Not_Allowed"; + return "indexing not allowed - indexText and indexMedia not set (for crawler)"; } // -CGI access in request diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java index 1b185028e..f06115794 100644 --- a/source/de/anomic/crawler/Latency.java +++ b/source/de/anomic/crawler/Latency.java @@ -149,18 +149,17 @@ public class Latency { * @return the remaining waiting time in milliseconds */ public static long waitingRemaining(yacyURL url, final long minimumLocalDelta, final long minimumGlobalDelta) { + + // find the minimum waiting time based on the network domain (local or global) + final boolean local = url.isLocal(); + long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; // first check if the domain was _ever_ accessed before String hosthash = url.hash().substring(6); Host host = host(hosthash); - if (host == null) return 0; // no delay // the time since last access to the domain is the basis of the remaining calculation - final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); - - // find the minimum waiting time based on the network domain (local or global) - final boolean local = url.isLocal(); - long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; + final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc(); // for CGI accesses, we double the minimum time // mostly there is a database access in the background @@ -168,7 +167,7 @@ public class Latency { if (url.isCGI()) waiting = waiting * 2; // if we have accessed the domain many times, get slower (the flux factor) - if (!local) waiting += host.flux(waiting); + if (!local && host != null) waiting += host.flux(waiting); // find the delay as given by robots.txt on target site long robotsDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(url); @@ -177,7 +176,7 @@ public class Latency { // use the access latency as rule how fast we can access the server // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses - waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2); + if (host != null) waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2); // prevent that that a robots file can stop our indexer completely waiting = Math.min(60000, waiting); diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 9e281b7a0..2fb571868 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -51,6 +51,7 @@ public class NoticedURL { private Balancer remoteStack; // links from remote crawl orders public NoticedURL(final File cachePath) { + Log.logInfo("NoticedURL", "CREATING STACKS at " + cachePath.toString()); this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", false, minimumLocalDeltaInit, minimumGlobalDeltaInit); this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", false, minimumLocalDeltaInit, minimumGlobalDeltaInit); //overhangStack = new plasmaCrawlBalancer(overhangStackFile); @@ -72,13 +73,14 @@ public class NoticedURL { } public void clear() { - Log.logInfo("NoticedURL", "CLEARING ALL STACKS!"); + Log.logInfo("NoticedURL", "CLEARING ALL STACKS"); coreStack.clear(); limitStack.clear(); remoteStack.clear(); } public void close() { + Log.logInfo("NoticedURL", "CLOSING ALL STACKS"); if (coreStack != null) { coreStack.close(); coreStack = null; diff --git a/source/de/anomic/document/Parser.java b/source/de/anomic/document/Parser.java index c316f0137..f2b136905 100644 --- a/source/de/anomic/document/Parser.java +++ b/source/de/anomic/document/Parser.java @@ -113,12 +113,12 @@ public final class Parser { if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); mime2parser.put(mimeType, parser); Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName()); - - if (prototypeMime != null) for (String ext: parser.supportedExtensions()) { - String s = ext2mime.get(ext); - if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'."); - ext2mime.put(ext, prototypeMime); - } + } + + if (prototypeMime != null) for (String ext: parser.supportedExtensions()) { + String s = ext2mime.get(ext); + if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'."); + ext2mime.put(ext, prototypeMime); } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 37fb4186c..9d3231216 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -785,20 +785,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch= 0, - "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); - + // create new web structure this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(queuesRoot, "webStructure.map")); @@ -873,11 +867,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch= 0, + "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); + } // start up crawl jobs continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); - this.log.logInfo("switched network to " + networkDefinition); + log.logInfo("SWITCH NETWORK: FINISHED START UP, new network is now '" + networkDefinition + "'."); + // check status of account configuration: when local url crawling is allowed, it is not allowed // that an automatic authorization of localhost is done, because in this case crawls from local // addresses are blocked to prevent attack szenarios where remote pages contain links to localhost @@ -1032,11 +1036,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch 1)) { - return "no DHT distribution: indexing in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + crawler.queuePreStack.size(); + if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_INDEXING, "false").equalsIgnoreCase("false")) && (crawler.indexingStack.size() > 1)) { + return "no DHT distribution: indexing in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + crawler.indexingStack.size(); } return null; // this means; yes, please do dht transfer } @@ -2275,7 +2288,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch