diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index ba9af1916..32243d102 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -64,7 +64,7 @@ public class urls { yacyURL referrer; while ((count > 0) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) { try { - entry = sb.crawlQueues.noticeURL.pop(stackType, false); + entry = sb.crawlQueues.noticeURL.pop(stackType, false, sb.webIndex.profilesActiveCrawls); } catch (final IOException e) { break; } diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index e0cde994a..a12e6bfc3 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -398,7 +398,18 @@ public class Balancer { } } - public synchronized CrawlEntry pop(boolean delay) throws IOException { + /** + * get the next entry in this crawl queue in such a way that the domain access time delta is maximized + * and always above the given minimum delay time. An additional delay time is computed using the robots.txt + * crawl-delay time which is always respected. In case the minimum time cannot ensured, this method pauses + * the necessary time until the url is released and returned as CrawlEntry object. In case that a profile + * for the computed Entry does not exist, null is returned + * @param delay + * @param profile + * @return a url in a CrawlEntry object + * @throws IOException + */ + public synchronized CrawlEntry pop(boolean delay, CrawlProfile profile) throws IOException { // returns a crawl entry from the stack and ensures minimum delta times // we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack @@ -565,8 +576,11 @@ public class Balancer { throw new IOException("get() found a valid urlhash, but failed to fetch the corresponding url entry - total size = " + size() + ", fileStack.size() = " + urlFileStack.size() + ", ramStack.size() = " + urlRAMStack.size() + ", domainStacks.size() = " + domainStacks.size()); } assert urlFileIndex.size() + 1 == s : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result; - final CrawlEntry crawlEntry = new CrawlEntry(rowEntry); - long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta); + final CrawlEntry crawlEntry = new CrawlEntry(rowEntry); + // at this point we must check if the crawlEntry has relevancy because the crawl profile still exists + // if not: return null. A calling method must handle the null value and try again + if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) return null; + long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server if (delay && sleeptime > 0) { // force a busy waiting here diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 73b84ffd5..efb1c4ca9 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -30,6 +30,7 @@ import java.util.Iterator; import java.util.Map; import de.anomic.kelondro.kelondroBLOB; +import de.anomic.kelondro.kelondroBLOBHeap; import de.anomic.kelondro.kelondroBLOBTree; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCloneableIterator; @@ -47,10 +48,10 @@ public class CrawlProfile { kelondroMap profileTable; private final File profileTableFile; - public CrawlProfile(final File file) { + public CrawlProfile(final File file) throws IOException { this.profileTableFile = file; profileTableFile.getParentFile().mkdirs(); - final kelondroBLOB dyn = new kelondroBLOBTree(profileTableFile, true, true, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, false, false, true); + final kelondroBLOB dyn = new kelondroBLOBHeap(profileTableFile, yacySeedDB.commonHashLength, kelondroNaturalOrder.naturalOrder); profileTable = new kelondroMap(dyn, 500); } @@ -183,6 +184,14 @@ public class CrawlProfile { return ne; } + public boolean hasEntry(final String handle) { + try { + return profileTable.has(handle); + } catch (final IOException e) { + return false; + } + } + public entry getEntry(final String handle) { HashMap m; try { diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index aa4b49b21..660a9c493 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -174,7 +174,7 @@ public class CrawlQueues { // move some tasks to the core crawl job so we have something to do final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance for (int i = 0; i < toshift; i++) { - noticeURL.shift(NoticedURL.STACK_TYPE_LIMIT, NoticedURL.STACK_TYPE_CORE); + noticeURL.shift(NoticedURL.STACK_TYPE_LIMIT, NoticedURL.STACK_TYPE_CORE, sb.webIndex.profilesActiveCrawls); } log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() + ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboardConstants.CLUSTER_MODE, "") + @@ -214,7 +214,7 @@ public class CrawlQueues { while (urlEntry == null && noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) > 0) { final String stats = "LOCALCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]"; try { - urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true); + urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true, sb.webIndex.profilesActiveCrawls); final String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); @@ -224,7 +224,7 @@ public class CrawlQueues { } final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(profileHandle); if (profile == null) { - log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); + log.logWarning(stats + ": LOST LOCALCRAWL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); return true; } @@ -421,7 +421,7 @@ public class CrawlQueues { final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]"; try { - final CrawlEntry urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true); + final CrawlEntry urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.webIndex.profilesActiveCrawls); final String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + @@ -429,7 +429,7 @@ public class CrawlQueues { final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(profileHandle); if (profile == null) { - log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); + log.logWarning(stats + ": LOST REMOTETRIGGEREDCRAWL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); return false; } diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 1fbf60eb8..6b9d76ef8 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -397,7 +397,7 @@ public final class CrawlStacker extends Thread { final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(entry.profileHandle()); if (profile == null) { - final String errorMsg = "LOST PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); + final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); log.logWarning(errorMsg); return errorMsg; } diff --git a/source/de/anomic/crawler/NoticeURLImporter.java b/source/de/anomic/crawler/NoticeURLImporter.java index 07b7ed95e..8546c2421 100644 --- a/source/de/anomic/crawler/NoticeURLImporter.java +++ b/source/de/anomic/crawler/NoticeURLImporter.java @@ -12,7 +12,7 @@ public class NoticeURLImporter extends AbstractImporter implements Importer { private File plasmaPath = null; private final HashSet importProfileHandleCache = new HashSet(); - private final CrawlProfile importProfileDB; + private CrawlProfile importProfileDB; private final NoticedURL importNurlDB; private final int importStartSize; private int urlCount = 0; @@ -73,7 +73,17 @@ public class NoticeURLImporter extends AbstractImporter implements Importer { // init profile DB this.log.logInfo("Initializing the source profileDB"); - this.importProfileDB = new CrawlProfile(profileDbFile); + try { + this.importProfileDB = new CrawlProfile(profileDbFile); + } catch (IOException e) { + profileDbFile.delete(); + try { + this.importProfileDB = new CrawlProfile(profileDbFile); + } catch (IOException e1) { + e1.printStackTrace(); + this.importProfileDB = null; + } + } } public long getEstimatedTime() { @@ -130,7 +140,7 @@ public class NoticeURLImporter extends AbstractImporter implements Importer { if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break; this.urlCount++; - nextEntry = this.importNurlDB.pop(stackTypes[stackType], false); + nextEntry = this.importNurlDB.pop(stackTypes[stackType], false, null); nextHash = nextEntry.url().hash(); } else { if (!entryIter.hasNext()) break; diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 26e690fec..637d63a69 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -184,18 +184,18 @@ public class NoticedURL { } } - public CrawlEntry pop(final int stackType, final boolean delay) throws IOException { + public CrawlEntry pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException { switch (stackType) { - case STACK_TYPE_CORE: return pop(coreStack, delay); - case STACK_TYPE_LIMIT: return pop(limitStack, delay); - case STACK_TYPE_REMOTE: return pop(remoteStack, delay); + case STACK_TYPE_CORE: return pop(coreStack, delay, profile); + case STACK_TYPE_LIMIT: return pop(limitStack, delay, profile); + case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profile); default: return null; } } - public void shift(final int fromStack, final int toStack) { + public void shift(final int fromStack, final int toStack, CrawlProfile profile) { try { - final CrawlEntry entry = pop(fromStack, false); + final CrawlEntry entry = pop(fromStack, false, profile); if (entry != null) push(toStack, entry); } catch (final IOException e) { return; @@ -211,13 +211,13 @@ public class NoticedURL { } } - private CrawlEntry pop(final Balancer balancer, final boolean delay) throws IOException { + private CrawlEntry pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException { // this is a filo - pop int s; CrawlEntry entry; synchronized (balancer) { while ((s = balancer.size()) > 0) { - entry = balancer.pop(delay); + entry = balancer.pop(delay, profile); if (entry == null) { if (s > balancer.size()) continue; final int aftersize = balancer.size(); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 0b3434c61..cd4bcb0e5 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -88,8 +88,8 @@ public final class plasmaWordIndex implements indexRI { public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia"; - public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.db"; - public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.db"; + public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap"; + public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap"; public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; @@ -163,7 +163,17 @@ public final class plasmaWordIndex implements indexRI { final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesActive1.db"); if (oldFile.exists()) oldFile.renameTo(profilesActiveFile); } - this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile); + try { + this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile); + } catch (IOException e) { + profilesActiveFile.delete(); + try { + this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile); + } catch (IOException e1) { + e1.printStackTrace(); + this.profilesActiveCrawls = null; + } + } initActiveCrawlProfiles(); log.logConfig("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries" + @@ -174,7 +184,17 @@ public final class plasmaWordIndex implements indexRI { final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesPassive1.db"); if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile); } - this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile); + try { + this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile); + } catch (IOException e) { + profilesPassiveFile.delete(); + try { + this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile); + } catch (IOException e1) { + e1.printStackTrace(); + this.profilesPassiveCrawls = null; + } + } log.logConfig("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() + ", " + this.profilesPassiveCrawls.size() + " entries" + ", " + profilesPassiveFile.length()/1024); @@ -296,7 +316,11 @@ public final class plasmaWordIndex implements indexRI { private void resetProfiles() { final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); if (pdb.exists()) pdb.delete(); - profilesActiveCrawls = new CrawlProfile(pdb); + try { + profilesActiveCrawls = new CrawlProfile(pdb); + } catch (IOException e) { + e.printStackTrace(); + } initActiveCrawlProfiles(); }