enhancement to crawl stacker enqueue order

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4192 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 93905e5c7b
commit ccbfb15b6b

@ -305,8 +305,7 @@ public class WatchCrawler_p {
(String) e.getValue(),
new Date(),
0,
profile,
true);
profile);
}
} catch (PatternSyntaxException e) {

@ -86,6 +86,9 @@ public final class plasmaCrawlStacker extends Thread {
private long preloadTime;
private int dbtype;
private boolean prequeue;
private long dnsHit, dnsMiss;
private int alternateCount;
// objects for the prefetch task
private ArrayList dnsfetchHosts = new ArrayList();
@ -93,6 +96,9 @@ public final class plasmaCrawlStacker extends Thread {
public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath, long preloadTime, int dbtype, boolean prequeue) {
this.sb = sb;
this.prequeue = prequeue;
this.dnsHit = 0;
this.dnsMiss = 0;
this.alternateCount = 0;
// init the message list
this.urlEntryHashCache = new LinkedList();
@ -152,14 +158,18 @@ public final class plasmaCrawlStacker extends Thread {
} catch (InterruptedException e) {}
}
public void prefetchHost(String host) {
public boolean prefetchHost(String host) {
// Looks the given host up in the dns cache only.
// Returns true when the host was known in the dns cache.
// If not, the host is stacked on the fetch stack (dnsfetchHosts) and false is returned.
try {
// any result of the lookup is ignored; only success or failure matters here
serverDomains.dnsResolveFromCache(host);
return true;
} catch (UnknownHostException e) {
// host not known in the cache: queue it and wake all threads waiting on this object's monitor
// NOTE(review): presumably the waiting thread is the dns prefetch task -- confirm
synchronized (this) {
dnsfetchHosts.add(host);
notifyAll();
}
return false;
}
}
@ -221,8 +231,7 @@ public final class plasmaCrawlStacker extends Thread {
String name,
Date loadDate,
int currentdepth,
plasmaCrawlProfile.entry profile,
boolean first) {
plasmaCrawlProfile.entry profile) {
if (profile == null) return;
plasmaCrawlEntry newEntry = new plasmaCrawlEntry(
initiatorHash,
@ -240,17 +249,28 @@ public final class plasmaCrawlStacker extends Thread {
synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue;
if (prequeue) prefetchHost(nexturl.getHost());
boolean hostknown = true;
if (prequeue) hostknown = prefetchHost(nexturl.getHost());
try {
oldValue = this.urlEntryCache.put(newEntry.toRow());
} catch (IOException e) {
oldValue = null;
}
if (oldValue == null) {
if (first) {
//System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
if (hostknown) {
this.alternateCount++;
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.dnsHit++;
} else {
this.urlEntryHashCache.addLast(newEntry.url().hash());
if ((this.dnsMiss > 0) && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) {
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.alternateCount = 0;
//System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
} else {
this.urlEntryHashCache.addLast(newEntry.url().hash());
}
this.dnsMiss++;
}
}
}

@ -2206,7 +2206,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
nextUrl = new yacyURL(nextUrlString, null);
// enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile(), ((entry.depth() <= 1) || (entry.depth() + 1 >= entry.profile().generalDepth())));
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
} catch (MalformedURLException e1) {}
}
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +

Loading…
Cancel
Save