enhancements to crawling and remote crawling:

- for the redirector and for remote crawling, place the crawl URL on the notice queue instead of enqueueing it directly in the crawler queue (see the sketch below)
- when a request to a remote crawl provider fails, remove the peer from the network so that the URL fetcher does not get stuck on that peer again
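
The first change turns the call sites into pure producers: instead of running the (possibly slow) stacking checks inline, the redirector and the remote-crawl handlers hand the URL to the CrawlStacker, which extends Thread and drains its own queue. A minimal sketch of that hand-off, with deliberately simplified, assumed types (plain String URLs and a stub stackCrawl stand in for YaCy's CrawlEntry machinery; the names here are illustrative, not the real API):

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    final class StackerSketch extends Thread {

        // the "notice queue": request handlers only enqueue; this thread stacks
        private final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();

        // producer side, called by the redirector / remote-crawl handlers;
        // returns immediately instead of reporting a rejection reason
        public void enqueueEntry(final String url) {
            this.queue.offer(url);
        }

        // consumer side: the stacker thread does the actual validation and stacking
        @Override
        public void run() {
            try {
                while (!isInterrupted()) {
                    final String url = this.queue.take();   // blocks until an entry arrives
                    final String reason = stackCrawl(url);  // null = accepted
                    if (reason != null) {
                        System.out.println("rejected " + url + ": " + reason);
                    }
                }
            } catch (final InterruptedException e) {
                Thread.currentThread().interrupt();         // restore flag and exit
            }
        }

        // stub for the real checks (double occurrence, profile, and other reject reasons)
        private String stackCrawl(final String url) {
            return null;
        }
    }

The hunks below show exactly this migration: each stackCrawl call plus its reason-string handling collapses into a single enqueueEntry call.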

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5320 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 3f746be5d4
commit 1b18d4bcf3

@@ -76,17 +76,7 @@ public class rct_p {
             if (urlRejectReason == null) {
                 // stack url
                 if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
-                if (reasonString == null) {
-                    // done
-                    env.getLog().logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                } else if (reasonString.startsWith("double")) {
-                    // case where we have already the url loaded;
-                    env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                } else {
-                    env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                }
+                sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
             } else {
                 env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
             }

@@ -362,7 +362,13 @@ public class CrawlQueues {
         // we know a peer which should provide remote crawl entries. load them now.
         final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.webIndex.seedDB, seed, 30, 5000);
-        if (feed == null) return true;
+        if (feed == null || feed.size() == 0) {
+            // something is wrong with this provider; to prevent that the url fetcher
+            // gets stuck with this peer again, we remove it from the peer list
+            sb.webIndex.peerActions.peerDeparture(seed, "no results from provided remote crawls");
+            return true;
+        }
         // parse the rss
         yacyURL url, referrer;
         Date loaddate;
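
Treating an empty feed the same way as a missing one is the point of this hunk: a peer that advertises remote crawl entries but returns none would otherwise be queried over and over, which is exactly the "stuck" situation the commit message describes, so the provider is dropped via peerDeparture.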
@@ -389,17 +395,7 @@ public class CrawlQueues {
             if (urlRejectReason == null) {
                 // stack url
                 if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
-                if (reasonString == null) {
-                    // done
-                    log.logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                } else if (reasonString.startsWith("double")) {
-                    // case where we have already the url loaded;
-                    log.logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                } else {
-                    log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                }
+                sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
             } else {
                 log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
             }

@@ -210,6 +210,33 @@ public final class CrawlStacker extends Thread {
         return true;
     }

+    public String stackCrawl(
+            final yacyURL url,
+            final String referrerhash,
+            final String initiatorHash,
+            final String name,
+            final Date loadDate,
+            final int currentdepth,
+            final CrawlProfile.entry profile) {
+        // stacks a crawl item. The position can also be remote
+        // returns null if successful, a reason string if not successful
+        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
+
+        // add the url into the crawling queue
+        final CrawlEntry entry = new CrawlEntry(
+                initiatorHash,                               // initiator, needed for p2p-feedback
+                url,                                         // url clear text string
+                (referrerhash == null) ? "" : referrerhash,  // last url in crawling queue
+                name,                                        // the anchor name
+                loadDate,                                    // load date
+                (profile == null) ? null : profile.handle(), // profile must not be null!
+                currentdepth,                                // depth so far
+                0,                                           // anchors, default value
+                0                                            // forkfactor, default value
+        );
+        return stackCrawl(entry);
+    }
+
     public void enqueueEntry(
             final yacyURL nexturl,
             final String referrerhash,
@@ -342,26 +369,6 @@ public final class CrawlStacker extends Thread {
         return new CrawlEntry(entry);
     }

-    public String stackCrawl(final yacyURL url, final yacyURL referrer, final String initiatorHash, final String name, final Date loadDate, final int currentdepth, final CrawlProfile.entry profile) {
-        // stacks a crawl item. The position can also be remote
-        // returns null if successful, a reason string if not successful
-        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-
-        // add the url into the crawling queue
-        final CrawlEntry entry = new CrawlEntry(
-                initiatorHash, // initiator, needed for p2p-feedback
-                url, // url clear text string
-                (referrer == null) ? "" : referrer.hash(), // last url in crawling queue
-                name, // load date
-                loadDate, // the anchor name
-                (profile == null) ? null : profile.handle(), // profile must not be null!
-                currentdepth, // depth so far
-                0, // anchors, default value
-                0 // forkfactor, default value
-        );
-        return stackCrawl(entry);
-    }
-
     public String stackCrawl(final CrawlEntry entry) {
         // stacks a crawl item. The position can also be remote
         // returns null if successful, a reason string if not successful
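
Taken together, the two CrawlStacker hunks replace the overload that took the referrer as a yacyURL with one that takes the referrer hash as a plain String (callers on the notice-queue path only have the hash at hand), while stackCrawl(CrawlEntry) stays as the single place where validation and stacking actually happen.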

@@ -195,7 +195,7 @@ public class urlRedirectord implements serverHandler, Cloneable {
             sb.crawlQueues.errorURL.remove(urlhash);

             // enqueuing URL for crawling
-            reasonString = sb.crawlStacker.stackCrawl(
+            sb.crawlStacker.enqueueEntry(
                 reqURL,
                 null,
                 sb.webIndex.seedDB.mySeed().hash,