- Crawl profiles are now deleted from both the active and the passive stacks

when they are removed in order to terminate a crawl; otherwise the crawl
would resume once the profile is loaded back from the passive stack.
- Improved the check for whether a crawl has terminated by also consulting the loader queue.
pull/1/head
Michael Peter Christen 11 years ago
parent 1b3d26dd23
commit 82bfd9e00a

@ -105,6 +105,7 @@ public class CrawlProfileEditor_p {
if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes());
sb.crawler.removePassive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);

@ -129,6 +129,7 @@ public class Crawler_p {
if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes());
sb.crawler.removePassive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);

@ -69,7 +69,10 @@ public class IndexCreateQueues_p {
entry = sb.crawler.getActive(handle);
final String name = entry.name();
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(name)) continue;
if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
if (compiledPattern.matcher(name).find()) {
sb.crawler.removeActive(entry.handle().getBytes());
sb.crawler.removePassive(entry.handle().getBytes());
}
}
} else {
// iterating through the list of URLs

@ -80,8 +80,8 @@ public final class CrawlSwitchboard {
DEFAULT_PROFILES.add(CRAWL_PROFILE_SURROGATE);
}
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive1.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@ -103,21 +103,22 @@ public final class CrawlSwitchboard {
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
private Switchboard switchboard;
public CrawlSwitchboard(final String networkName, final ConcurrentLog log, final File queuesRoot) {
public CrawlSwitchboard(final String networkName, Switchboard switchboard) {
log.info("Initializing Word Index for the network '" + networkName + "'.");
this.log = switchboard.log;
this.queuesRoot = switchboard.queuesRoot;
this.log.info("Initializing Word Index for the network '" + networkName + "'.");
if ( networkName == null || networkName.isEmpty() ) {
log.severe("no network name given - shutting down");
System.exit(0);
}
this.log = log;
this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
this.profilesActiveCrawlsCounter = new ConcurrentHashMap<String, RowHandleSet>();
// make crawl profiles database and default profiles
this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs();
this.log.config("Initializing Crawl Profiles");
@ -254,10 +255,12 @@ public final class CrawlSwitchboard {
/**
 * Stores a crawl profile on the active stack and mirrors it into the
 * in-memory active-profiles cache. The handle is afterwards removed from
 * the passive stack, so a profile handle is never present on both stacks
 * at once (which would let a terminated crawl resume from the passive stack).
 *
 * @param profileKey handle (key bytes) identifying the crawl profile
 * @param profile the crawl profile to activate
 */
public void putActive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesActiveCrawls.put(profileKey, profile);
this.profilesActiveCrawlsCache.put(profileKey, profile);
this.removePassive(profileKey);
}
/**
 * Stores a crawl profile on the passive stack. The handle is afterwards
 * removed from the active stack, keeping the invariant that a profile
 * handle lives on exactly one of the two stacks.
 * NOTE(review): unlike putActive, this does not touch
 * profilesActiveCrawlsCache — presumably removeActive clears the cache
 * entry as well; confirm against its implementation.
 *
 * @param profileKey handle (key bytes) identifying the crawl profile
 * @param profile the crawl profile to deactivate
 */
public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesPassiveCrawls.put(profileKey, profile);
this.removeActive(profileKey);
}
public RowHandleSet getURLHashes(final byte[] profileKey) {
@ -586,6 +589,11 @@ public final class CrawlSwitchboard {
}
if (deletionCandidate.size() == 0) return new HashSet<String>(0);
}
// look into the CrawlQueues.worker as well
Request[] requests = switchboard.crawlQueues.activeWorkerEntries();
for (Request request: requests) {
deletionCandidate.remove(request.profileHandle());
}
} catch (final Throwable e) {
return new HashSet<String>(0);
}

@ -269,14 +269,13 @@ public class CrawlQueues {
if (urlEntry == null) {
continue;
}
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
if (profileHandle == null) {
if (urlEntry.profileHandle() == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
load(urlEntry, stats, profileHandle);
load(urlEntry, stats);
return true;
} catch (final IOException e) {
this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
@ -296,8 +295,8 @@ public class CrawlQueues {
* @param stats String for log prefixing
* @return
*/
private void load(final Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(profileHandle));
private void load(final Request urlEntry, final String stats) {
final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(urlEntry.profileHandle()));
if (profile != null) {
// check if the protocol is supported
@ -574,11 +573,7 @@ public class CrawlQueues {
try {
final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler, this.sb.robots);
if (urlEntry == null) return false;
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
// urlEntry.url());
load(urlEntry, stats, profileHandle);
load(urlEntry, stats);
return true;
} catch (final IOException e) {
this.log.severe(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);

@ -536,7 +536,7 @@ public final class Switchboard extends serverSwitch {
}
// create a crawler
this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot);
this.crawler = new CrawlSwitchboard(networkName, this);
// start yacy core
this.log.config("Starting YaCy Protocol Core");
@ -1330,7 +1330,7 @@ public final class Switchboard extends serverSwitch {
// create a crawler
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot);
this.crawler = new CrawlSwitchboard(networkName, this);
// init a DHT transmission dispatcher
this.dhtDispatcher =

@ -62,7 +62,7 @@ public class serverSwitch
public final File dataPath;
public final File appPath;
protected boolean firstInit;
protected ConcurrentLog log;
public ConcurrentLog log;
protected int serverJobs;
private ConcurrentMap<String, String> configProps;
private final ConcurrentMap<String, String> configRemoved;

Loading…
Cancel
Save