fix for a problem with the balancer and lost crawl profiles:

if a crawl profile is lost, no robots.txt is loaded any more

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5258 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent d197a62faf
commit ff68f394dd
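
The heart of the change is Balancer.pop. Before this commit it computed the crawl delay for every popped entry, and computing that delay consults the robots.txt database and may trigger a fetch of robots.txt from the remote server, even for entries whose crawl profile had already been deleted. The new signature hands in the active-profile database so that such orphaned entries can be dropped before any robots.txt work is done. A condensed sketch of the new control flow (not the verbatim method; nextEntryFromStacks and waitUntilReleased are hypothetical stand-ins for the stack-selection and busy-waiting code in the real class):

    public synchronized CrawlEntry pop(boolean delay, CrawlProfile profile) throws IOException {
        final CrawlEntry crawlEntry = nextEntryFromStacks(); // hypothetical: choose from ramStack, domainStacks, fileStack
        if (crawlEntry == null) return null;
        // orphaned entry: its profile was deleted, so skip it without touching robots.txt
        if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) return null;
        // only now pay the (possibly network-bound) robots.txt cost
        final long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta);
        if (delay && sleeptime > 0) waitUntilReleased(sleeptime); // hypothetical: force the busy waiting
        return crawlEntry;
    }

Every caller must now be prepared for a null return and retry; the hunks below thread the profile database (sb.webIndex.profilesActiveCrawls) through all pop call sites.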

@@ -64,7 +64,7 @@ public class urls {
         yacyURL referrer;
         while ((count > 0) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
             try {
-                entry = sb.crawlQueues.noticeURL.pop(stackType, false);
+                entry = sb.crawlQueues.noticeURL.pop(stackType, false, sb.webIndex.profilesActiveCrawls);
             } catch (final IOException e) {
                 break;
             }

@@ -398,7 +398,18 @@ public class Balancer {
         }
     }
 
-    public synchronized CrawlEntry pop(boolean delay) throws IOException {
+    /**
+     * get the next entry in this crawl queue in such a way that the domain access time delta is maximized
+     * and always above the given minimum delay time. An additional delay time is computed using the robots.txt
+     * crawl-delay time, which is always respected. In case the minimum time cannot be ensured, this method pauses
+     * for the necessary time until the url is released and returned as a CrawlEntry object. In case that a profile
+     * for the computed Entry does not exist, null is returned.
+     * @param delay
+     * @param profile
+     * @return a url in a CrawlEntry object
+     * @throws IOException
+     */
+    public synchronized CrawlEntry pop(boolean delay, CrawlProfile profile) throws IOException {
         // returns a crawl entry from the stack and ensures minimum delta times
         // we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack
@@ -565,8 +576,11 @@
             throw new IOException("get() found a valid urlhash, but failed to fetch the corresponding url entry - total size = " + size() + ", fileStack.size() = " + urlFileStack.size() + ", ramStack.size() = " + urlRAMStack.size() + ", domainStacks.size() = " + domainStacks.size());
         }
         assert urlFileIndex.size() + 1 == s : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result;
-        final CrawlEntry crawlEntry = new CrawlEntry(rowEntry);
-        long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta);
+        final CrawlEntry crawlEntry = new CrawlEntry(rowEntry);
+        // at this point we must check if the crawlEntry is still relevant, i.e. if the crawl profile still exists;
+        // if not: return null. A calling method must handle the null value and try again
+        if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) return null;
+        long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
         if (delay && sleeptime > 0) {
             // force a busy waiting here

@@ -30,6 +30,7 @@ import java.util.Iterator;
 import java.util.Map;
 
 import de.anomic.kelondro.kelondroBLOB;
+import de.anomic.kelondro.kelondroBLOBHeap;
 import de.anomic.kelondro.kelondroBLOBTree;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroCloneableIterator;
@@ -47,10 +48,10 @@ public class CrawlProfile {
     kelondroMap profileTable;
     private final File profileTableFile;
 
-    public CrawlProfile(final File file) {
+    public CrawlProfile(final File file) throws IOException {
         this.profileTableFile = file;
         profileTableFile.getParentFile().mkdirs();
-        final kelondroBLOB dyn = new kelondroBLOBTree(profileTableFile, true, true, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, false, false, true);
+        final kelondroBLOB dyn = new kelondroBLOBHeap(profileTableFile, yacySeedDB.commonHashLength, kelondroNaturalOrder.naturalOrder);
         profileTable = new kelondroMap(dyn, 500);
     }
@@ -183,6 +184,14 @@ public class CrawlProfile {
         return ne;
     }
 
+    public boolean hasEntry(final String handle) {
+        try {
+            return profileTable.has(handle);
+        } catch (final IOException e) {
+            return false;
+        }
+    }
+
     public entry getEntry(final String handle) {
         HashMap<String, String> m;
         try {

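Note that hasEntry maps an IOException onto false: a profile table that cannot be read is treated the same as a deleted profile, so the balancer silently skips the entry instead of propagating the error into the crawl loop.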
@@ -174,7 +174,7 @@ public class CrawlQueues {
             // move some tasks to the core crawl job so we have something to do
             final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance
             for (int i = 0; i < toshift; i++) {
-                noticeURL.shift(NoticedURL.STACK_TYPE_LIMIT, NoticedURL.STACK_TYPE_CORE);
+                noticeURL.shift(NoticedURL.STACK_TYPE_LIMIT, NoticedURL.STACK_TYPE_CORE, sb.webIndex.profilesActiveCrawls);
             }
             log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() +
                     ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboardConstants.CLUSTER_MODE, "") +
@@ -214,7 +214,7 @@ public class CrawlQueues {
         while (urlEntry == null && noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) > 0) {
             final String stats = "LOCALCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]";
             try {
-                urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true);
+                urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true, sb.webIndex.profilesActiveCrawls);
                 final String profileHandle = urlEntry.profileHandle();
                 // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
                 // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@@ -224,7 +224,7 @@
                 }
                 final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(profileHandle);
                 if (profile == null) {
-                    log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
+                    log.logWarning(stats + ": LOST LOCALCRAWL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                     return true;
                 }
@@ -421,7 +421,7 @@ public class CrawlQueues {
         final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", "
                 + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]";
         try {
-            final CrawlEntry urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true);
+            final CrawlEntry urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.webIndex.profilesActiveCrawls);
             final String profileHandle = urlEntry.profileHandle();
             // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
             // profileHandle = " + profileHandle + ", urlEntry.url = " +
@@ -429,7 +429,7 @@
             final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(profileHandle);
             if (profile == null) {
-                log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
+                log.logWarning(stats + ": LOST REMOTETRIGGEREDCRAWL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                 return false;
             }

@@ -397,7 +397,7 @@ public final class CrawlStacker extends Thread {
         final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
         if (profile == null) {
-            final String errorMsg = "LOST PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
+            final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
             log.logWarning(errorMsg);
             return errorMsg;
         }
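Together with the LOCALCRAWL and REMOTETRIGGEREDCRAWL variants above, the distinct log prefixes now identify which code path encountered the lost profile.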

@@ -12,7 +12,7 @@ public class NoticeURLImporter extends AbstractImporter implements Importer {
     private File plasmaPath = null;
     private final HashSet<String> importProfileHandleCache = new HashSet<String>();
-    private final CrawlProfile importProfileDB;
+    private CrawlProfile importProfileDB;
     private final NoticedURL importNurlDB;
     private final int importStartSize;
     private int urlCount = 0;
@@ -73,7 +73,17 @@ public class NoticeURLImporter extends AbstractImporter implements Importer {
 
         // init profile DB
         this.log.logInfo("Initializing the source profileDB");
-        this.importProfileDB = new CrawlProfile(profileDbFile);
+        try {
+            this.importProfileDB = new CrawlProfile(profileDbFile);
+        } catch (IOException e) {
+            profileDbFile.delete();
+            try {
+                this.importProfileDB = new CrawlProfile(profileDbFile);
+            } catch (IOException e1) {
+                e1.printStackTrace();
+                this.importProfileDB = null;
+            }
+        }
     }
 
     public long getEstimatedTime() {
@@ -130,7 +140,7 @@ public class NoticeURLImporter extends AbstractImporter implements Importer {
                     if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break;
                     this.urlCount++;
-                    nextEntry = this.importNurlDB.pop(stackTypes[stackType], false);
+                    nextEntry = this.importNurlDB.pop(stackTypes[stackType], false, null);
                     nextHash = nextEntry.url().hash();
                 } else {
                     if (!entryIter.hasNext()) break;
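The importer passes null as the profile database, which disables the new existence check: in Balancer.pop the guard reads profile != null && !profile.hasEntry(...), so imported queues are drained without consulting the live profiles.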

@@ -184,18 +184,18 @@ public class NoticedURL {
         }
     }
 
-    public CrawlEntry pop(final int stackType, final boolean delay) throws IOException {
+    public CrawlEntry pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException {
         switch (stackType) {
-            case STACK_TYPE_CORE:   return pop(coreStack, delay);
-            case STACK_TYPE_LIMIT:  return pop(limitStack, delay);
-            case STACK_TYPE_REMOTE: return pop(remoteStack, delay);
+            case STACK_TYPE_CORE:   return pop(coreStack, delay, profile);
+            case STACK_TYPE_LIMIT:  return pop(limitStack, delay, profile);
+            case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profile);
             default: return null;
         }
     }
 
-    public void shift(final int fromStack, final int toStack) {
+    public void shift(final int fromStack, final int toStack, CrawlProfile profile) {
         try {
-            final CrawlEntry entry = pop(fromStack, false);
+            final CrawlEntry entry = pop(fromStack, false, profile);
             if (entry != null) push(toStack, entry);
         } catch (final IOException e) {
             return;
@@ -211,13 +211,13 @@
         }
     }
 
-    private CrawlEntry pop(final Balancer balancer, final boolean delay) throws IOException {
+    private CrawlEntry pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException {
         // this is a filo - pop
         int s;
         CrawlEntry entry;
         synchronized (balancer) {
             while ((s = balancer.size()) > 0) {
-                entry = balancer.pop(delay);
+                entry = balancer.pop(delay, profile);
                 if (entry == null) {
                     if (s > balancer.size()) continue;
                     final int aftersize = balancer.size();

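The hunk above is cut off by the diff context, but the intent of the null handling is visible: a null from balancer.pop means the entry was skipped because its profile is gone, so the loop should move on to the next entry, while taking care not to spin forever if the balancer stops shrinking. A minimal sketch of how such a loop can terminate safely, with everything after the shown context filled in as an assumption (the actual code may differ):

    private CrawlEntry pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException {
        int s;
        CrawlEntry entry;
        synchronized (balancer) {
            while ((s = balancer.size()) > 0) {
                entry = balancer.pop(delay, profile);
                if (entry != null) return entry;
                // null = entry skipped (lost profile); retry only if the balancer
                // actually shrank, otherwise give up to avoid an endless loop
                if (s <= balancer.size()) return null; // assumption: stop when no progress is made
            }
        }
        return null;
    }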
@@ -88,8 +88,8 @@ public final class plasmaWordIndex implements indexRI {
     public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
     public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
     public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
-    public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.db";
-    public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.db";
+    public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
+    public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
     public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
     public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
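
The extension change from .db to .heap matches the switch from kelondroBLOBTree to kelondroBLOBHeap in the CrawlProfile constructor: an existing crawlProfilesActive.db is simply left behind rather than migrated. The rename in the next hunk only covers the older crawlProfilesActive1.db location, and presumably a renamed BLOBTree file will fail to open as a heap, which is exactly the case the new delete-and-retry recovery below is there to handle.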
@@ -163,7 +163,17 @@ public final class plasmaWordIndex implements indexRI {
             final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesActive1.db");
             if (oldFile.exists()) oldFile.renameTo(profilesActiveFile);
         }
-        this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
+        try {
+            this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
+        } catch (IOException e) {
+            profilesActiveFile.delete();
+            try {
+                this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
+            } catch (IOException e1) {
+                e1.printStackTrace();
+                this.profilesActiveCrawls = null;
+            }
+        }
         initActiveCrawlProfiles();
         log.logConfig("Loaded active crawl profiles from file " + profilesActiveFile.getName() +
                 ", " + this.profilesActiveCrawls.size() + " entries" +
@@ -174,7 +184,17 @@ public final class plasmaWordIndex implements indexRI {
             final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesPassive1.db");
             if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile);
         }
-        this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
+        try {
+            this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
+        } catch (IOException e) {
+            profilesPassiveFile.delete();
+            try {
+                this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
+            } catch (IOException e1) {
+                e1.printStackTrace();
+                this.profilesPassiveCrawls = null;
+            }
+        }
         log.logConfig("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
                 ", " + this.profilesPassiveCrawls.size() + " entries" +
                 ", " + profilesPassiveFile.length()/1024);
@@ -296,7 +316,11 @@ public final class plasmaWordIndex implements indexRI {
     private void resetProfiles() {
         final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
         if (pdb.exists()) pdb.delete();
-        profilesActiveCrawls = new CrawlProfile(pdb);
+        try {
+            profilesActiveCrawls = new CrawlProfile(pdb);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
         initActiveCrawlProfiles();
     }
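
The recover-by-deleting pattern around new CrawlProfile(file) now appears three times (once in NoticeURLImporter, twice in plasmaWordIndex). A possible follow-up, not part of this commit and with an invented name, would be to pull it into a shared helper:

    // hypothetical helper, not in the commit: open a profile DB, discarding a corrupt file once
    private static CrawlProfile openOrRecreate(final File file) {
        try {
            return new CrawlProfile(file);
        } catch (final IOException e) {
            // the heap file cannot be opened: delete it and retry once with a fresh file
            file.delete();
            try {
                return new CrawlProfile(file);
            } catch (final IOException e1) {
                e1.printStackTrace();
                return null; // callers must cope with a missing profile DB
            }
        }
    }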
