From c0e17de2fbc6da7c4721d6fb038473bd66569177 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sat, 25 Jul 2009 21:38:57 +0000
Subject: [PATCH] - fixes for some problems with the new crawling/caching strategies

- speed enhancements for the cache-only cache policy by using special no-delay rules in the balancer
- fixed some deadlock and 100%-CPU problems in the balancer

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6243 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/CrawlStart_p.html                      |  2 +-
 htroot/ViewImage.java                         |  2 +
 htroot/api/util/getpageinfo_p.java            | 33 ++++++++-----
 source/de/anomic/crawler/Balancer.java        | 47 ++++++++++++++-----
 source/de/anomic/crawler/CrawlQueues.java     |  6 +--
 source/de/anomic/crawler/Latency.java         |  2 +-
 source/de/anomic/crawler/RobotsTxt.java       |  2 +-
 .../crawler/retrieval/LoaderDispatcher.java   |  4 +-
 .../de/anomic/document/parser/swfParser.java  |  2 +
 9 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html
index 4af4ae456..61e218e53 100644
--- a/htroot/CrawlStart_p.html
+++ b/htroot/CrawlStart_p.html
@@ -200,8 +200,8 @@
     : no cache
-    if exist
     if fresh
+    if exist
     cache only

diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 8303d559f..c73f81b4b 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -76,6 +76,8 @@ public class ViewImage {
             urlString = (url == null) ? null : url.toNormalform(true, true);
         }
 
+        if (urlString == null) return null;
+
         int width = post.getInt("width", 0);
         int height = post.getInt("height", 0);
         int maxwidth = post.getInt("maxwidth", 0);

diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index d265da13c..2554cc288 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -40,10 +40,24 @@ public class getpageinfo_p {
             url = "http://" + url;
         }
         if (actions.indexOf("title")>=0) {
+            yacyURL u = null;
             try {
-                final yacyURL u = new yacyURL(url, null);
-                final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
-
+                u = new yacyURL(url, null);
+            } catch (final MalformedURLException e) {
+                // fail, do nothing
+            }
+            ContentScraper scraper = null;
+            if (u != null) try {
+                scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+            } catch (final IOException e) {
+                // try again, try harder
+                try {
+                    scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
+                } catch (final IOException ee) {
+                    // now thats a fail, do nothing
+                }
+            }
+            if (scraper != null) {
                 // put the document title
                 prop.putXML("title", scraper.getTitle());
@@ -54,11 +68,11 @@ public class getpageinfo_p {
                 final String list[]=scraper.getKeywords();
                 int count = 0;
                 for(int i=0;i languages = scraper.getContentLanguages();
                 prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
-
-            } catch (final MalformedURLException e) { /* ignore this */
-            } catch (final IOException e) { /* ignore this */
             }
         }
         if(actions.indexOf("robots")>=0){
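The getpageinfo_p change above separates URL parsing from page loading and adds a fallback: the page is first requested with the if-fresh cache policy and, only if that throws an IOException, requested again with the weaker if-exist policy before giving up, presumably because a stale cached copy is still good enough for extracting a title and keywords. A minimal, self-contained sketch of that retry pattern (Loader and CacheStrategy below are illustrative stand-ins, not YaCy's ContentScraper/CrawlProfile API):

import java.io.IOException;

// Sketch of the "try fresh first, then accept any cached copy" fallback
// used by getpageinfo_p above. All types here are illustrative stand-ins.
public final class ScrapeWithFallback {

    enum CacheStrategy { IFFRESH, IFEXIST }

    interface Loader {
        String parse(String url, CacheStrategy strategy) throws IOException;
    }

    /** Returns the parsed page, or null if neither strategy succeeded. */
    static String parseWithFallback(Loader loader, String url) {
        try {
            return loader.parse(url, CacheStrategy.IFFRESH);
        } catch (IOException e) {
            // try again, try harder: accept a stale cache entry rather than nothing
            try {
                return loader.parse(url, CacheStrategy.IFEXIST);
            } catch (IOException ee) {
                return null; // caller must cope with a missing result
            }
        }
    }
}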
"unknown" : languages.iterator().next()); - - } catch (final MalformedURLException e) { /* ignore this */ - } catch (final IOException e) { /* ignore this */ } } if(actions.indexOf("robots")>=0){ diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 4d976ee30..9db2b1828 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -34,6 +34,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import de.anomic.crawler.retrieval.Request; +import de.anomic.http.client.Cache; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.ObjectIndex; import de.anomic.kelondro.order.CloneableIterator; @@ -107,7 +108,7 @@ public class Balancer { } } - public synchronized Request get(final String urlhash) throws IOException { + public Request get(final String urlhash) throws IOException { assert urlhash != null; if (urlFileIndex == null) return null; // case occurs during shutdown final Row.Entry entry = urlFileIndex.get(urlhash.getBytes()); @@ -189,7 +190,7 @@ public class Balancer { return removedCounter; } - public synchronized boolean has(final String urlhash) { + public boolean has(final String urlhash) { return urlFileIndex.has(urlhash.getBytes()); } @@ -305,17 +306,29 @@ public class Balancer { long sleeptime = 0; Request crawlEntry = null; synchronized (this) { + String failhash = null; while (this.urlFileIndex.size() > 0) { // first simply take one of the entries in the top list, that should be one without any delay - String result = nextFromDelayed(); - if (result == null && this.top.size() > 0) result = top.remove(); + String nexthash = nextFromDelayed(); + //System.out.println("*** nextFromDelayed=" + nexthash); + if (nexthash == null && this.top.size() > 0) { + nexthash = top.remove(); + //System.out.println("*** top.remove()=" + nexthash); + } // check minimumDelta and if necessary force a sleep //final int s = urlFileIndex.size(); - Row.Entry rowEntry = (result == null) ? null : urlFileIndex.remove(result.getBytes()); + Row.Entry rowEntry = (nexthash == null) ? null : urlFileIndex.remove(nexthash.getBytes()); if (rowEntry == null) { + //System.out.println("*** rowEntry=null, nexthash=" + nexthash); rowEntry = urlFileIndex.removeOne(); - result = (rowEntry == null) ? null : new String(rowEntry.getPrimaryKeyBytes()); + if (rowEntry == null) { + nexthash = null; + } else { + nexthash = new String(rowEntry.getPrimaryKeyBytes()); + //System.out.println("*** rowEntry.getPrimaryKeyBytes()=" + nexthash); + } + } if (rowEntry == null) { Log.logWarning("Balancer", "removeOne() failed - size = " + this.size()); @@ -334,18 +347,28 @@ public class Balancer { return null; } // depending on the caching policy we need sleep time to avoid DoS-like situations - sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server + sleeptime = ( + profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY || + (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_IFEXIST && Cache.has(crawlEntry.url())) + ) ? 
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index eca4c0962..9e47ced54 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -87,12 +87,12 @@ public class CrawlQueues {
         // tests if hash occurrs in any database
         // if it exists, the name of the database is returned,
         // if it not exists, null is returned
-        if (noticeURL.existsInStack(hash)) return "crawler";
         if (delegatedURL.exists(hash)) return "delegated";
         if (errorURL.exists(hash)) return "errors";
         for (final crawlWorker worker: workers.values()) {
             if (worker.request.url().hash().equals(hash)) return "worker";
         }
+        if (noticeURL.existsInStack(hash)) return "crawler";
         return null;
     }
@@ -105,8 +105,6 @@ public class CrawlQueues {
     public yacyURL getURL(final String urlhash) {
         assert urlhash != null;
         if (urlhash == null || urlhash.length() == 0) return null;
-        final Request ne = noticeURL.get(urlhash);
-        if (ne != null) return ne.url();
         ZURL.Entry ee = delegatedURL.getEntry(urlhash);
         if (ee != null) return ee.url();
         ee = errorURL.getEntry(urlhash);
@@ -114,6 +112,8 @@ public class CrawlQueues {
         for (final crawlWorker w: workers.values()) {
             if (w.request.url().hash().equals(urlhash)) return w.request.url();
         }
+        final Request ne = noticeURL.get(urlhash);
+        if (ne != null) return ne.url();
         return null;
     }

diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java
index 798a22eb7..2d304a935 100644
--- a/source/de/anomic/crawler/Latency.java
+++ b/source/de/anomic/crawler/Latency.java
@@ -202,7 +202,7 @@ public class Latency {
     }
     public void update(long time) {
         this.lastacc = System.currentTimeMillis();
-        this.timeacc += time;
+        this.timeacc += Math.min(30000, time);
         this.count++;
     }
     public void update() {

diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index ed07c6bd4..2c0d34cc6 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -202,7 +202,7 @@ public class RobotsTxt {
         int sz = this.robotsTable.size();
         addEntry(robotsTxt4Host);
         if (this.robotsTable.size() <= sz) {
-            Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, reseing database");
+            Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, resetting database");
            this.resetDatabase();
            addEntry(robotsTxt4Host);
         }
     }
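The one-line Latency change caps a single measured access time at 30 seconds before it enters the accumulator, so one hanging request no longer dominates the average that is derived from timeacc and count. A self-contained illustration of the effect (the class is only a sketch; the field names and the Math.min(30000, time) cap are taken from the patch, the rest is assumed):

// Sketch of the capped latency accumulator from Latency.java above.
public final class HostLatency {

    private long timeacc = 0; // accumulated access time in ms
    private long count = 0;   // number of measured accesses

    public void update(long time) {
        // cap a single measurement at 30s so one stalled request
        // cannot blow up the average used for politeness delays
        this.timeacc += Math.min(30000, time);
        this.count++;
    }

    public long average() {
        return count == 0 ? 0 : timeacc / count;
    }

    public static void main(String[] args) {
        HostLatency l = new HostLatency();
        l.update(200);
        l.update(300);
        l.update(600000); // a 10-minute hang is counted as 30s at most
        System.out.println(l.average()); // ~10166 ms instead of ~200166 ms without the cap
    }
}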
diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
index c8251600e..398d89ec9 100644
--- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
+++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
@@ -200,7 +200,7 @@ public final class LoaderDispatcher {
         // now forget about the cache, nothing there. Try to load the content from the internet
 
         // check access time: this is a double-check (we checked possibly already in the balancer)
-        // to make shure that we don't DoS the target by mistake
+        // to make sure that we don't DoS the target by mistake
         if (!request.url().isLocal()) {
             final Long lastAccess = accessTime.get(host);
             long wait = 0;
@@ -214,7 +214,7 @@ public final class LoaderDispatcher {
             }
         }
 
-        // now it's for shure that we will access the target. Remember the access time
+        // now it's for sure that we will access the target. Remember the access time
         accessTime.put(host, System.currentTimeMillis());
 
         // load resource from the internet

diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java
index a2a60cc88..b64cef0a1 100644
--- a/source/de/anomic/document/parser/swfParser.java
+++ b/source/de/anomic/document/parser/swfParser.java
@@ -78,6 +78,8 @@ public class swfParser extends AbstractParser implements Idiom {
         String contents = "";
         try {
             contents = swf2html.convertSWFToHTML(source);
+        } catch (NegativeArraySizeException e) {
+            // seen in log
         } catch (Exception e) {
             // we have seen a lot of OOM errors in the parser...
             e.printStackTrace();
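The swfParser hunk narrows the error handling: a NegativeArraySizeException thrown by the SWF-to-HTML conversion of corrupt Flash files ("seen in log") is now caught explicitly instead of falling through to the generic Exception handler. A self-contained sketch of the pattern (the Converter interface is a hypothetical stand-in for the swf2html object, not its real API):

// Sketch of catching a known RuntimeException from a converter separately,
// as done for swf2html.convertSWFToHTML above. The converter is a stand-in.
public final class SwfTextExtractor {

    interface Converter {
        String convertSWFToHTML(byte[] source);
    }

    static String extract(Converter swf2html, byte[] source) {
        String contents = "";
        try {
            contents = swf2html.convertSWFToHTML(source);
        } catch (NegativeArraySizeException e) {
            // known failure mode for corrupt SWF input: keep the empty result
        } catch (Exception e) {
            // any other failure is logged and otherwise ignored
            e.printStackTrace();
        }
        return contents;
    }
}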