enhancement to crawl stacker enqueue order

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4192 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 93905e5c7b
commit ccbfb15b6b

@ -305,8 +305,7 @@ public class WatchCrawler_p {
(String) e.getValue(), (String) e.getValue(),
new Date(), new Date(),
0, 0,
profile, profile);
true);
} }
} catch (PatternSyntaxException e) { } catch (PatternSyntaxException e) {

@ -86,6 +86,9 @@ public final class plasmaCrawlStacker extends Thread {
private long preloadTime; private long preloadTime;
private int dbtype; private int dbtype;
private boolean prequeue; private boolean prequeue;
private long dnsHit, dnsMiss;
private int alternateCount;
// objects for the prefetch task // objects for the prefetch task
private ArrayList dnsfetchHosts = new ArrayList(); private ArrayList dnsfetchHosts = new ArrayList();
@ -93,6 +96,9 @@ public final class plasmaCrawlStacker extends Thread {
public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath, long preloadTime, int dbtype, boolean prequeue) { public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath, long preloadTime, int dbtype, boolean prequeue) {
this.sb = sb; this.sb = sb;
this.prequeue = prequeue; this.prequeue = prequeue;
this.dnsHit = 0;
this.dnsMiss = 0;
this.alternateCount = 0;
// init the message list // init the message list
this.urlEntryHashCache = new LinkedList(); this.urlEntryHashCache = new LinkedList();
@ -152,14 +158,18 @@ public final class plasmaCrawlStacker extends Thread {
} catch (InterruptedException e) {} } catch (InterruptedException e) {}
} }
public void prefetchHost(String host) { public boolean prefetchHost(String host) {
// returns true when the host was known in the dns cache.
// If not, the host is stacked on the fetch stack and false is returned
try { try {
serverDomains.dnsResolveFromCache(host); serverDomains.dnsResolveFromCache(host);
return true;
} catch (UnknownHostException e) { } catch (UnknownHostException e) {
synchronized (this) { synchronized (this) {
dnsfetchHosts.add(host); dnsfetchHosts.add(host);
notifyAll(); notifyAll();
} }
return false;
} }
} }
@ -221,8 +231,7 @@ public final class plasmaCrawlStacker extends Thread {
String name, String name,
Date loadDate, Date loadDate,
int currentdepth, int currentdepth,
plasmaCrawlProfile.entry profile, plasmaCrawlProfile.entry profile) {
boolean first) {
if (profile == null) return; if (profile == null) return;
plasmaCrawlEntry newEntry = new plasmaCrawlEntry( plasmaCrawlEntry newEntry = new plasmaCrawlEntry(
initiatorHash, initiatorHash,
@ -240,17 +249,28 @@ public final class plasmaCrawlStacker extends Thread {
synchronized(this.urlEntryHashCache) { synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue; kelondroRow.Entry oldValue;
if (prequeue) prefetchHost(nexturl.getHost()); boolean hostknown = true;
if (prequeue) hostknown = prefetchHost(nexturl.getHost());
try { try {
oldValue = this.urlEntryCache.put(newEntry.toRow()); oldValue = this.urlEntryCache.put(newEntry.toRow());
} catch (IOException e) { } catch (IOException e) {
oldValue = null; oldValue = null;
} }
if (oldValue == null) { if (oldValue == null) {
if (first) { //System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
if (hostknown) {
this.alternateCount++;
this.urlEntryHashCache.addFirst(newEntry.url().hash()); this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.dnsHit++;
} else { } else {
this.urlEntryHashCache.addLast(newEntry.url().hash()); if ((this.dnsMiss > 0) && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) {
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.alternateCount = 0;
//System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
} else {
this.urlEntryHashCache.addLast(newEntry.url().hash());
}
this.dnsMiss++;
} }
} }
} }

@ -2206,7 +2206,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
nextUrl = new yacyURL(nextUrlString, null); nextUrl = new yacyURL(nextUrlString, null);
// enqueue the hyperlink into the pre-notice-url db // enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile(), ((entry.depth() <= 1) || (entry.depth() + 1 >= entry.profile().generalDepth()))); crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
} catch (MalformedURLException e1) {} } catch (MalformedURLException e1) {}
} }
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) + log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +

Loading…
Cancel
Save