From ccbfb15b6b2619e5000b7aa06292c98eb29ee587 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 1 Nov 2007 00:57:32 +0000 Subject: [PATCH] enhancement to crawl stacker enqueue order git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4192 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/WatchCrawler_p.java | 3 +- .../de/anomic/plasma/plasmaCrawlStacker.java | 32 +++++++++++++++---- .../de/anomic/plasma/plasmaSwitchboard.java | 2 +- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index f7affeae2..565ee5deb 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -305,8 +305,7 @@ public class WatchCrawler_p { (String) e.getValue(), new Date(), 0, - profile, - true); + profile); } } catch (PatternSyntaxException e) { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 670ca4506..3de20e195 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -86,6 +86,9 @@ public final class plasmaCrawlStacker extends Thread { private long preloadTime; private int dbtype; private boolean prequeue; + private long dnsHit, dnsMiss; + private int alternateCount; + // objects for the prefetch task private ArrayList dnsfetchHosts = new ArrayList(); @@ -93,6 +96,9 @@ public final class plasmaCrawlStacker extends Thread { public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath, long preloadTime, int dbtype, boolean prequeue) { this.sb = sb; this.prequeue = prequeue; + this.dnsHit = 0; + this.dnsMiss = 0; + this.alternateCount = 0; // init the message list this.urlEntryHashCache = new LinkedList(); @@ -152,14 +158,18 @@ public final class plasmaCrawlStacker extends Thread { } catch (InterruptedException e) {} } - public void prefetchHost(String host) { + public boolean prefetchHost(String host) { + // returns true when the host was known in the dns cache. + // If not, the host is stacked on the fetch stack and false is returned try { serverDomains.dnsResolveFromCache(host); + return true; } catch (UnknownHostException e) { synchronized (this) { dnsfetchHosts.add(host); notifyAll(); } + return false; } } @@ -221,8 +231,7 @@ public final class plasmaCrawlStacker extends Thread { String name, Date loadDate, int currentdepth, - plasmaCrawlProfile.entry profile, - boolean first) { + plasmaCrawlProfile.entry profile) { if (profile == null) return; plasmaCrawlEntry newEntry = new plasmaCrawlEntry( initiatorHash, @@ -240,17 +249,28 @@ public final class plasmaCrawlStacker extends Thread { synchronized(this.urlEntryHashCache) { kelondroRow.Entry oldValue; - if (prequeue) prefetchHost(nexturl.getHost()); + boolean hostknown = true; + if (prequeue) hostknown = prefetchHost(nexturl.getHost()); try { oldValue = this.urlEntryCache.put(newEntry.toRow()); } catch (IOException e) { oldValue = null; } if (oldValue == null) { - if (first) { + //System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : "")); + if (hostknown) { + this.alternateCount++; this.urlEntryHashCache.addFirst(newEntry.url().hash()); + this.dnsHit++; } else { - this.urlEntryHashCache.addLast(newEntry.url().hash()); + if ((this.dnsMiss > 0) && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) { + this.urlEntryHashCache.addFirst(newEntry.url().hash()); + this.alternateCount = 0; + //System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss)); + } else { + this.urlEntryHashCache.addLast(newEntry.url().hash()); + } + this.dnsMiss++; } } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 6e555378c..e90266b61 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2206,7 +2206,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser nextUrl = new yacyURL(nextUrlString, null); // enqueue the hyperlink into the pre-notice-url db - crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile(), ((entry.depth() <= 1) || (entry.depth() + 1 >= entry.profile().generalDepth()))); + crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile()); } catch (MalformedURLException e1) {} } log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +