fix for a problem with the balancer and lost crawl profiles:

if a crawl profile is lost, no robots.txt is loaded for its URLs any more

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5258 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent d197a62faf
commit ff68f394dd
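
In short: Balancer.pop(...) now receives the table of active crawl profiles and checks each entry's profile handle before it consults the robots.txt database, so no robots.txt is fetched for entries whose profile has been lost. The core of the guard, as a minimal sketch (names taken from the hunks below):

    // inside Balancer.pop(boolean delay, CrawlProfile profile):
    if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) {
        return null; // the profile is gone: skip this entry before any robots.txt lookup
    }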

@@ -64,7 +64,7 @@ public class urls {
         yacyURL referrer;
         while ((count > 0) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
             try {
-                entry = sb.crawlQueues.noticeURL.pop(stackType, false);
+                entry = sb.crawlQueues.noticeURL.pop(stackType, false, sb.webIndex.profilesActiveCrawls);
             } catch (final IOException e) {
                 break;
             }

@@ -398,7 +398,18 @@ public class Balancer {
         }
     }
     
-    public synchronized CrawlEntry pop(boolean delay) throws IOException {
+    /**
+     * get the next entry in this crawl queue in such a way that the domain access time delta is maximized
+     * and always above the given minimum delay time. An additional delay time is computed using the robots.txt
+     * crawl-delay time, which is always respected. In case the minimum time cannot be ensured, this method pauses
+     * for the necessary time until the url is released and returned as a CrawlEntry object. In case that a profile
+     * for the computed entry does not exist, null is returned.
+     * @param delay
+     * @param profile
+     * @return a url in a CrawlEntry object
+     * @throws IOException
+     */
+    public synchronized CrawlEntry pop(boolean delay, CrawlProfile profile) throws IOException {
         // returns a crawl entry from the stack and ensures minimum delta times
         // we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack
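
For illustration, a minimal caller loop matching the contract documented in the new javadoc; the loop shape follows the NoticedURL.pop hunk further down, but the variable names here are assumptions:

    CrawlEntry entry = null;
    while (entry == null && balancer.size() > 0) {
        // pop waits as long as needed to honour the domain access delta and the
        // robots.txt crawl-delay; null means the entry's crawl profile is gone,
        // so that entry is dropped and the next one is tried
        entry = balancer.pop(true, sb.webIndex.profilesActiveCrawls);
    }
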
@@ -566,7 +577,10 @@ public class Balancer {
         }
         assert urlFileIndex.size() + 1 == s : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result;
         final CrawlEntry crawlEntry = new CrawlEntry(rowEntry);
-        long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta);
+        // at this point we must check if the crawlEntry is still relevant, i.e. if its crawl profile still exists
+        // if not: return null. A calling method must handle the null value and try again
+        if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) return null;
+        long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
         if (delay && sleeptime > 0) {
             // force a busy waiting here

@@ -30,6 +30,7 @@ import java.util.Iterator;
 import java.util.Map;
 
 import de.anomic.kelondro.kelondroBLOB;
+import de.anomic.kelondro.kelondroBLOBHeap;
 import de.anomic.kelondro.kelondroBLOBTree;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroCloneableIterator;
@@ -47,10 +48,10 @@ public class CrawlProfile {
     kelondroMap profileTable;
     private final File profileTableFile;
     
-    public CrawlProfile(final File file) {
+    public CrawlProfile(final File file) throws IOException {
         this.profileTableFile = file;
         profileTableFile.getParentFile().mkdirs();
-        final kelondroBLOB dyn = new kelondroBLOBTree(profileTableFile, true, true, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, false, false, true);
+        final kelondroBLOB dyn = new kelondroBLOBHeap(profileTableFile, yacySeedDB.commonHashLength, kelondroNaturalOrder.naturalOrder);
         profileTable = new kelondroMap(dyn, 500);
     }
@@ -183,6 +184,14 @@ public class CrawlProfile {
         return ne;
     }
     
+    public boolean hasEntry(final String handle) {
+        try {
+            return profileTable.has(handle);
+        } catch (final IOException e) {
+            return false;
+        }
+    }
+    
     public entry getEntry(final String handle) {
         HashMap<String, String> m;
         try {

@@ -174,7 +174,7 @@ public class CrawlQueues {
             // move some tasks to the core crawl job so we have something to do
             final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance
             for (int i = 0; i < toshift; i++) {
-                noticeURL.shift(NoticedURL.STACK_TYPE_LIMIT, NoticedURL.STACK_TYPE_CORE);
+                noticeURL.shift(NoticedURL.STACK_TYPE_LIMIT, NoticedURL.STACK_TYPE_CORE, sb.webIndex.profilesActiveCrawls);
             }
             log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() +
                     ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboardConstants.CLUSTER_MODE, "") +
@@ -214,7 +214,7 @@ public class CrawlQueues {
         while (urlEntry == null && noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) > 0) {
             final String stats = "LOCALCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]";
             try {
-                urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true);
+                urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true, sb.webIndex.profilesActiveCrawls);
                 final String profileHandle = urlEntry.profileHandle();
                 // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
                 // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@@ -224,7 +224,7 @@ public class CrawlQueues {
                 }
                 final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(profileHandle);
                 if (profile == null) {
-                    log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
+                    log.logWarning(stats + ": LOST LOCALCRAWL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                     return true;
                 }
@@ -421,7 +421,7 @@ public class CrawlQueues {
         final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", "
                 + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]";
         try {
-            final CrawlEntry urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true);
+            final CrawlEntry urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.webIndex.profilesActiveCrawls);
             final String profileHandle = urlEntry.profileHandle();
             // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
             // profileHandle = " + profileHandle + ", urlEntry.url = " +
@@ -429,7 +429,7 @@ public class CrawlQueues {
             final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(profileHandle);
             if (profile == null) {
-                log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
+                log.logWarning(stats + ": LOST REMOTETRIGGEREDCRAWL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                 return false;
             }

@@ -397,7 +397,7 @@ public final class CrawlStacker extends Thread {
         final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
         if (profile == null) {
-            final String errorMsg = "LOST PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
+            final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
             log.logWarning(errorMsg);
             return errorMsg;
         }

@@ -12,7 +12,7 @@ public class NoticeURLImporter extends AbstractImporter implements Importer {
     private File plasmaPath = null;
     private final HashSet<String> importProfileHandleCache = new HashSet<String>();
-    private final CrawlProfile importProfileDB;
+    private CrawlProfile importProfileDB;
     private final NoticedURL importNurlDB;
     private final int importStartSize;
     private int urlCount = 0;
@@ -73,7 +73,17 @@ public class NoticeURLImporter extends AbstractImporter implements Importer {
         
         // init profile DB
         this.log.logInfo("Initializing the source profileDB");
-        this.importProfileDB = new CrawlProfile(profileDbFile);
+        try {
+            this.importProfileDB = new CrawlProfile(profileDbFile);
+        } catch (IOException e) {
+            profileDbFile.delete();
+            try {
+                this.importProfileDB = new CrawlProfile(profileDbFile);
+            } catch (IOException e1) {
+                e1.printStackTrace();
+                this.importProfileDB = null;
+            }
+        }
     }
     
     public long getEstimatedTime() {
@@ -130,7 +140,7 @@ public class NoticeURLImporter extends AbstractImporter implements Importer {
                     if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break;
                     this.urlCount++;
-                    nextEntry = this.importNurlDB.pop(stackTypes[stackType], false);
+                    nextEntry = this.importNurlDB.pop(stackTypes[stackType], false, null);
                     nextHash = nextEntry.url().hash();
                 } else {
                     if (!entryIter.hasNext()) break;

@@ -184,18 +184,18 @@ public class NoticedURL {
         }
     }
     
-    public CrawlEntry pop(final int stackType, final boolean delay) throws IOException {
+    public CrawlEntry pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException {
         switch (stackType) {
-            case STACK_TYPE_CORE: return pop(coreStack, delay);
-            case STACK_TYPE_LIMIT: return pop(limitStack, delay);
-            case STACK_TYPE_REMOTE: return pop(remoteStack, delay);
+            case STACK_TYPE_CORE: return pop(coreStack, delay, profile);
+            case STACK_TYPE_LIMIT: return pop(limitStack, delay, profile);
+            case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profile);
             default: return null;
         }
     }
     
-    public void shift(final int fromStack, final int toStack) {
+    public void shift(final int fromStack, final int toStack, CrawlProfile profile) {
         try {
-            final CrawlEntry entry = pop(fromStack, false);
+            final CrawlEntry entry = pop(fromStack, false, profile);
             if (entry != null) push(toStack, entry);
         } catch (final IOException e) {
             return;
@@ -211,13 +211,13 @@ public class NoticedURL {
         }
     }
     
-    private CrawlEntry pop(final Balancer balancer, final boolean delay) throws IOException {
+    private CrawlEntry pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException {
        // this is a filo - pop
        int s;
        CrawlEntry entry;
        synchronized (balancer) {
            while ((s = balancer.size()) > 0) {
-                entry = balancer.pop(delay);
+                entry = balancer.pop(delay, profile);
                if (entry == null) {
                    if (s > balancer.size()) continue;
                    final int aftersize = balancer.size();

@@ -88,8 +88,8 @@ public final class plasmaWordIndex implements indexRI {
     public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
     public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
     public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
-    public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.db";
-    public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.db";
+    public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
+    public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
     public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
     public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@@ -163,7 +163,17 @@ public final class plasmaWordIndex implements indexRI {
             final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesActive1.db");
             if (oldFile.exists()) oldFile.renameTo(profilesActiveFile);
         }
-        this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
+        try {
+            this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
+        } catch (IOException e) {
+            profilesActiveFile.delete();
+            try {
+                this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
+            } catch (IOException e1) {
+                e1.printStackTrace();
+                this.profilesActiveCrawls = null;
+            }
+        }
         initActiveCrawlProfiles();
         log.logConfig("Loaded active crawl profiles from file " + profilesActiveFile.getName() +
                 ", " + this.profilesActiveCrawls.size() + " entries" +
@@ -174,7 +184,17 @@ public final class plasmaWordIndex implements indexRI {
             final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesPassive1.db");
             if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile);
         }
-        this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
+        try {
+            this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
+        } catch (IOException e) {
+            profilesPassiveFile.delete();
+            try {
+                this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
+            } catch (IOException e1) {
+                e1.printStackTrace();
+                this.profilesPassiveCrawls = null;
+            }
+        }
         log.logConfig("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
                 ", " + this.profilesPassiveCrawls.size() + " entries" +
                 ", " + profilesPassiveFile.length()/1024);
@@ -296,7 +316,11 @@ public final class plasmaWordIndex implements indexRI {
     private void resetProfiles() {
         final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
         if (pdb.exists()) pdb.delete();
-        profilesActiveCrawls = new CrawlProfile(pdb);
+        try {
+            profilesActiveCrawls = new CrawlProfile(pdb);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
         initActiveCrawlProfiles();
     }
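
The same delete-and-retry recovery appears three times in this commit. As a sketch only, it could be factored into a helper; openOrReset is a hypothetical name, not part of the commit:

    // Hypothetical helper that factors out the recovery used for all three
    // profile databases: if the heap file is unreadable, discard it and
    // start over with a fresh, empty one.
    private static CrawlProfile openOrReset(final File file) {
        try {
            return new CrawlProfile(file);
        } catch (final IOException e) {
            file.delete(); // the heap file is corrupted: discard it
            try {
                return new CrawlProfile(file); // retry with a fresh file
            } catch (final IOException e1) {
                e1.printStackTrace();
                return null; // callers must cope with a missing profile DB
            }
        }
    }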
