a set of small fixes to crawling behaviour

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6216 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent a564df3984
commit 13c63f4082

@@ -63,20 +63,20 @@ public class IndexCreateIndexingQueue_p {
             }
             if (post.containsKey("clearIndexingQueue")) {
                 try {
-                    synchronized (sb.crawler.queuePreStack) {
+                    synchronized (sb.crawler.indexingStack) {
                         IndexingStack.QueueEntry entry = null;
-                        while ((entry = sb.crawler.queuePreStack.pop()) != null) {
+                        while ((entry = sb.crawler.indexingStack.pop()) != null) {
                             if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
                                 plasmaHTCache.deleteFromCache(entry.url());
                             }
                         }
-                        sb.crawler.queuePreStack.clear(); // reset file to clean up content completely
+                        sb.crawler.indexingStack.clear(); // reset file to clean up content completely
                     }
                 } catch (final Exception e) {}
             } else if (post.containsKey("deleteEntry")) {
                 final String urlHash = post.get("deleteEntry");
                 try {
-                    sb.crawler.queuePreStack.remove(urlHash);
+                    sb.crawler.indexingStack.remove(urlHash);
                 } catch (final Exception e) {}
                 prop.put("LOCATION","");
                 return prop;
@@ -86,7 +86,7 @@ public class IndexCreateIndexingQueue_p {
         yacySeed initiator;
         boolean dark;
-        if ((sb.crawler.queuePreStack.size() == 0) && (sb.crawler.queuePreStack.getActiveQueueSize() == 0)) {
+        if ((sb.crawler.indexingStack.size() == 0) && (sb.crawler.indexingStack.getActiveQueueSize() == 0)) {
             prop.put("indexing-queue", "0"); //is empty
         } else {
             prop.put("indexing-queue", "1"); // there are entries in the queue or in process
@@ -98,12 +98,12 @@ public class IndexCreateIndexingQueue_p {
             // getting all entries that are currently in process
             final ArrayList<IndexingStack.QueueEntry> entryList = new ArrayList<IndexingStack.QueueEntry>();
-            entryList.addAll(sb.crawler.queuePreStack.getActiveQueueEntries());
+            entryList.addAll(sb.crawler.indexingStack.getActiveQueueEntries());
             final int inProcessCount = entryList.size();
             // getting all enqueued entries
-            if ((sb.crawler.queuePreStack.size() > 0)) {
-                final Iterator<IndexingStack.QueueEntry> i = sb.crawler.queuePreStack.entryIterator(false);
+            if ((sb.crawler.indexingStack.size() > 0)) {
+                final Iterator<IndexingStack.QueueEntry> i = sb.crawler.indexingStack.entryIterator(false);
                 while (i.hasNext()) entryList.add(i.next());
             }

@@ -286,7 +286,7 @@ public class Status {
         prop.putNum("connectionsMax", httpd.getMaxSessionCount());
         // Queue information
-        final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.crawler.queuePreStack.getActiveQueueSize();
+        final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.crawler.indexingStack.getActiveQueueSize();
         final int indexingMaxCount = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30);
         final int indexingPercent = (indexingMaxCount==0)?0:indexingJobCount*100/indexingMaxCount;
         prop.putNum("indexingQueueSize", indexingJobCount);

@@ -39,11 +39,11 @@ public class queues_p {
         yacySeed initiator;
         //indexing queue
-        prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.crawler.queuePreStack.getActiveQueueSize());
+        prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.crawler.indexingStack.getActiveQueueSize());
         prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30));
         prop.putNum("urlpublictextSize", sb.indexSegment.urlMetadata().size());
         prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().sizesMax());
-        if ((sb.crawler.queuePreStack.size() == 0) && (sb.crawler.queuePreStack.getActiveQueueSize() == 0)) {
+        if ((sb.crawler.indexingStack.size() == 0) && (sb.crawler.indexingStack.getActiveQueueSize() == 0)) {
             prop.put("list", "0"); //is empty
         } else {
             IndexingStack.QueueEntry pcentry;
@@ -52,12 +52,12 @@ public class queues_p {
             // getting all entries that are currently in process
             final ArrayList<IndexingStack.QueueEntry> entryList = new ArrayList<IndexingStack.QueueEntry>();
-            entryList.addAll(sb.crawler.queuePreStack.getActiveQueueEntries());
+            entryList.addAll(sb.crawler.indexingStack.getActiveQueueEntries());
             final int inProcessCount = entryList.size();
             // getting all enqueued entries
-            if ((sb.crawler.queuePreStack.size() > 0)) {
-                final Iterator<IndexingStack.QueueEntry> i1 = sb.crawler.queuePreStack.entryIterator(false);
+            if ((sb.crawler.indexingStack.size() > 0)) {
+                final Iterator<IndexingStack.QueueEntry> i1 = sb.crawler.indexingStack.entryIterator(false);
                 while (i1.hasNext()) try {
                     entryList.add(i1.next());
                 } catch (kelondroException e) {

@@ -95,7 +95,7 @@ public class Balancer {
     }
     public void clear() {
-        Log.logInfo("Balancer", "cleaing balancer with " + urlFileIndex.size() + " entries from " + urlFileIndex.filename());
+        Log.logInfo("Balancer", "cleaning balancer with " + urlFileIndex.size() + " entries from " + urlFileIndex.filename());
         try {
             urlFileIndex.clear();
         } catch (IOException e) {
@@ -289,7 +289,7 @@ public class Balancer {
      * @return a url in a CrawlEntry object
      * @throws IOException
      */
-    public CrawlEntry pop(boolean delay, CrawlProfile profile) throws IOException {
+    public CrawlEntry pop(final boolean delay, final CrawlProfile profile) throws IOException {
         // returns a crawl entry from the stack and ensures minimum delta times
         filltop(delay, -600000, false);
@@ -355,7 +355,7 @@ public class Balancer {
                 // in best case, this should never happen if the balancer works propertly
                 // this is only to protection against the worst case, where the crawler could
                 // behave in a DoS-manner
-                Log.logInfo("BALANCER", "forcing crawl-delay of " + (sleeptime / 1000) + " seconds for " + crawlEntry.url().getHost() + ((sleeptime > Math.max(minimumLocalDelta, minimumGlobalDelta)) ? " (forced latency)" : ""));
+                Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ((sleeptime > Math.max(minimumLocalDelta, minimumGlobalDelta)) ? " (forced latency)" : ""));
                 long loops = sleeptime / 3000;
                 long rest = sleeptime % 3000;
                 if (loops < 2) {
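
The forced crawl-delay above is then slept off in 3-second slices (loops full slices plus a rest) rather than in one long Thread.sleep. A minimal standalone sketch of that slicing arithmetic; the class and method names here are illustrative, not part of the patch:

    // Sketch only: split a long crawl-delay into 3000 ms slices,
    // mirroring the loops/rest arithmetic in the Balancer hunk above.
    public final class SlicedSleep {
        public static void sleepSliced(final long sleeptime) throws InterruptedException {
            final long loops = sleeptime / 3000; // number of full 3-second slices
            final long rest = sleeptime % 3000;  // remainder below one slice
            for (long i = 0; i < loops; i++) {
                Thread.sleep(3000); // other checks can be interleaved between slices
            }
            if (rest > 0) Thread.sleep(rest);
        }

        public static void main(final String[] args) throws InterruptedException {
            sleepSliced(7500); // sleeps 3000 + 3000 + 1500 ms
        }
    }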

@@ -303,9 +303,9 @@ public class CrawlQueues {
             }
             value = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30);
-            if (sb.crawler.queuePreStack.size() >= value) {
+            if (sb.crawler.indexingStack.size() >= value) {
                 if (this.log.isFine()) {
-                    log.logFine(type + "Crawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.crawler.queuePreStack.size() + ")");
+                    log.logFine(type + "Crawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.crawler.indexingStack.size() + ")");
                 }
                 return false;
             }
@@ -322,9 +322,10 @@ public class CrawlQueues {
                 return false;
             }
-            if (sb.onlineCaution()) {
+            String cautionCause = sb.onlineCaution();
+            if (cautionCause != null) {
                 if (this.log.isFine()) {
-                    log.logFine(type + "Crawl: online caution, omitting processing");
+                    log.logFine(type + "Crawl: online caution for " + cautionCause + ", omitting processing");
                 }
                 return false;
             }
@@ -344,8 +345,8 @@ public class CrawlQueues {
             return false;
         }
-        if (sb.crawler.queuePreStack.size() >= (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30) / 2) {
-            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.crawler.queuePreStack.size() + ")");
+        if (sb.crawler.indexingStack.size() >= (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30) / 2) {
+            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.crawler.indexingStack.size() + ")");
             return false;
         }
@@ -359,8 +360,9 @@ public class CrawlQueues {
             return false;
         }
-        if (sb.onlineCaution()) {
-            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: online caution, omitting processing");
+        String cautionCause = sb.onlineCaution();
+        if (cautionCause != null) {
+            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: online caution for " + cautionCause + ", omitting processing");
             return false;
         }
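
Both caution checks above move from a boolean test to a nullable cause string, the new contract of plasmaSwitchboard.onlineCaution() (see the hunk further down): null means no caution, otherwise the name of the responsible service is returned so callers can log a meaningful reason. A condensed, self-contained sketch of that contract; the field names and the fixed 30000 ms delay stand in for the switchboard configuration:

    // Sketch of the new onlineCaution() contract introduced by this commit.
    public final class CautionSketch {
        volatile long proxyLastAccess, localSearchLastAccess, remoteSearchLastAccess;
        static final long CAUTION_DELAY = 30000; // default delay used by the patch

        public String onlineCaution() {
            final long now = System.currentTimeMillis();
            if (now - this.proxyLastAccess < CAUTION_DELAY) return "proxy";
            if (now - this.localSearchLastAccess < CAUTION_DELAY) return "localsearch";
            if (now - this.remoteSearchLastAccess < CAUTION_DELAY) return "remotesearch";
            return null; // no limit exceeded
        }

        public static void main(final String[] args) {
            final CautionSketch sb = new CautionSketch();
            sb.proxyLastAccess = System.currentTimeMillis();
            final String cautionCause = sb.onlineCaution(); // caller pattern from the diff
            if (cautionCause != null) {
                System.out.println("online caution for " + cautionCause + ", omitting processing");
            }
        }
    }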

@@ -58,7 +58,7 @@ public final class CrawlSwitchboard {
     public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
     private final Log log;
-    public IndexingStack queuePreStack;
+    public IndexingStack indexingStack;
     public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
     public CrawlProfile.entry defaultProxyProfile;
     public CrawlProfile.entry defaultRemoteProfile;
@@ -134,11 +134,11 @@ public final class CrawlSwitchboard {
                 ", " + profilesPassiveFile.length()/1024);
         // init queues
-        this.queuePreStack = new IndexingStack(peers, queuesRoot, "urlNoticePreStack.stack", this.profilesActiveCrawls);
+        this.indexingStack = new IndexingStack(peers, queuesRoot, "urlNoticePreStack.stack", this.profilesActiveCrawls);
     }
     public void clear() {
-        queuePreStack.clear();
+        indexingStack.clear();
     }
     private void initActiveCrawlProfiles() {
@@ -230,7 +230,7 @@ public final class CrawlSwitchboard {
     public boolean cleanProfiles() throws InterruptedException {
-        if (queuePreStack.size() > 0) return false;
+        if (indexingStack.size() > 0) return false;
         final Iterator<CrawlProfile.entry> iter = profilesActiveCrawls.profiles(true);
         CrawlProfile.entry entry;
         boolean hasDoneSomething = false;
@@ -264,7 +264,7 @@ public final class CrawlSwitchboard {
     public void close() {
         this.profilesActiveCrawls.close();
         this.profilesPassiveCrawls.close();
-        this.queuePreStack.close();
+        this.indexingStack.close();
     }
 }

@@ -204,7 +204,7 @@ public class IndexingStack {
         return new QueueEntry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, anchorName);
     }
-    public void enQueueToActive(final QueueEntry entry) {
+    public void store(final QueueEntry entry) {
         queueInProcess.put(entry.url().hash(), entry);
     }
@@ -437,7 +437,7 @@ public class IndexingStack {
             // check profile
             if (!profile().indexText() && !profile().indexMedia()) {
-                return "Indexing_Not_Allowed";
+                return "indexing not allowed - indexText and indexMedia not set (for proxy)";
             }
             // -CGI access in request
@@ -579,7 +579,7 @@ public class IndexingStack {
             // check profile
             if (!profile().indexText() && !profile().indexMedia()) {
-                return "Indexing_Not_Allowed";
+                return "indexing not allowed - indexText and indexMedia not set (for crawler)";
             }
             // -CGI access in request

@@ -149,18 +149,17 @@ public class Latency {
      * @return the remaining waiting time in milliseconds
      */
     public static long waitingRemaining(yacyURL url, final long minimumLocalDelta, final long minimumGlobalDelta) {
-        // find the minimum waiting time based on the network domain (local or global)
-        final boolean local = url.isLocal();
-        long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
         // first check if the domain was _ever_ accessed before
         String hosthash = url.hash().substring(6);
         Host host = host(hosthash);
-        if (host == null) return 0; // no delay
-        // the time since last access to the domain is the basis of the remaining calculation
-        final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
+        // find the minimum waiting time based on the network domain (local or global)
+        final boolean local = url.isLocal();
+        long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
+        final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc();
         // for CGI accesses, we double the minimum time
         // mostly there is a database access in the background
@@ -168,7 +167,7 @@ public class Latency {
         if (url.isCGI()) waiting = waiting * 2;
         // if we have accessed the domain many times, get slower (the flux factor)
-        if (!local) waiting += host.flux(waiting);
+        if (!local && host != null) waiting += host.flux(waiting);
         // find the delay as given by robots.txt on target site
         long robotsDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(url);
@@ -177,7 +176,7 @@ public class Latency {
         // use the access latency as rule how fast we can access the server
         // this applies also to localhost, but differently, because it is not necessary to
         // consider so many external accesses
-        waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
+        if (host != null) waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
         // prevent that that a robots file can stop our indexer completely
         waiting = Math.min(60000, waiting);
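
The reworked waitingRemaining() above no longer returns 0 for a never-seen host; instead each host-dependent term (flux factor, access latency) is guarded, so the minimum local/global delta still applies on first contact. A self-contained sketch of that flow under the stated assumptions; Host is a minimal stand-in, the robots.txt delay of the real method is omitted, and the final subtraction of the elapsed time is inferred from the "remaining waiting time" javadoc:

    // Sketch of the null-safe delay computation after this hunk.
    public final class LatencySketch {
        interface Host {
            long lastacc();          // time of last access to the domain
            long flux(long waiting); // penalty for frequently accessed hosts
            long average();          // measured access latency
        }

        static long waitingRemaining(final boolean local, final boolean cgi, final Host host,
                                     final long minimumLocalDelta, final long minimumGlobalDelta) {
            long waiting = local ? minimumLocalDelta : minimumGlobalDelta;
            final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc();
            if (cgi) waiting = waiting * 2; // CGI pages usually hit a database
            if (!local && host != null) waiting += host.flux(waiting);
            if (host != null) waiting = Math.max(waiting, local ? host.average() / 2 : host.average() * 2);
            waiting = Math.min(60000, waiting); // never stall the crawler completely
            return Math.max(0, waiting - timeSinceLastAccess);
        }

        public static void main(final String[] args) {
            // unknown host: the minimum delta is now the effective delay, not 0
            System.out.println(waitingRemaining(false, false, null, 500, 2000)); // prints 2000
        }
    }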

@@ -51,6 +51,7 @@ public class NoticedURL {
     private Balancer remoteStack; // links from remote crawl orders
     public NoticedURL(final File cachePath) {
+        Log.logInfo("NoticedURL", "CREATING STACKS at " + cachePath.toString());
         this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", false, minimumLocalDeltaInit, minimumGlobalDeltaInit);
         this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", false, minimumLocalDeltaInit, minimumGlobalDeltaInit);
         //overhangStack = new plasmaCrawlBalancer(overhangStackFile);
@@ -72,13 +73,14 @@ public class NoticedURL {
     }
     public void clear() {
-        Log.logInfo("NoticedURL", "CLEARING ALL STACKS!");
+        Log.logInfo("NoticedURL", "CLEARING ALL STACKS");
         coreStack.clear();
         limitStack.clear();
         remoteStack.clear();
     }
     public void close() {
+        Log.logInfo("NoticedURL", "CLOSING ALL STACKS");
         if (coreStack != null) {
             coreStack.close();
             coreStack = null;

@@ -113,12 +113,12 @@ public final class Parser {
             if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
             mime2parser.put(mimeType, parser);
             Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
-            if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
-                String s = ext2mime.get(ext);
-                if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
-                ext2mime.put(ext, prototypeMime);
-            }
         }
+        if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
+            String s = ext2mime.get(ext);
+            if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
+            ext2mime.put(ext, prototypeMime);
+        }
     }
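
The Parser hunk moves the extension-to-mime registration out of the per-mime loop, so it runs once per registered parser instead of once per supported mime type. A compact sketch of the corrected registration order; the Idiom interface and map names are stand-ins modeled on the identifiers in the diff:

    // Sketch of the corrected registration flow shown above.
    import java.util.HashMap;
    import java.util.Map;

    public final class ParserRegistrySketch {
        interface Idiom {
            String[] supportedMimeTypes();
            String[] supportedExtensions();
        }

        final Map<String, Idiom> mime2parser = new HashMap<String, Idiom>();
        final Map<String, String> ext2mime = new HashMap<String, String>();

        void register(final Idiom parser, final String prototypeMime) {
            for (final String mimeType : parser.supportedMimeTypes()) {
                mime2parser.put(mimeType, parser);
            }
            // moved out of the loop above: runs once per parser, so an extension
            // is no longer re-registered (and reported as overwritten) for every
            // additional mime type the same parser supports
            if (prototypeMime != null) {
                for (final String ext : parser.supportedExtensions()) {
                    ext2mime.put(ext, prototypeMime);
                }
            }
        }
    }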

@@ -785,20 +785,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     }
     public void switchNetwork(final String networkDefinition) {
         log.logInfo("SWITCH NETWORK: switching to '" + networkDefinition + "'");
         // pause crawls
         final boolean lcp = crawlJobIsPaused(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
         if (!lcp) pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
         final boolean rcp = crawlJobIsPaused(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
         if (!rcp) pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
         // trigger online caution
-        proxyLastAccess = System.currentTimeMillis() + 10000; // at least 10 seconds online caution to prevent unnecessary action on database meanwhile
+        proxyLastAccess = System.currentTimeMillis() + 3000; // at least 3 seconds online caution to prevent unnecessary action on database meanwhile
         log.logInfo("SWITCH NETWORK: SHUT DOWN OF OLD INDEX DATABASE...");
         // clean search events which have cached relations to the old index
         QueryEvent.cleanupEvents(true);
         // switch the networks
-        synchronized (this) {
+        synchronized (this) {
             // shut down
             synchronized (this.indexSegment) {
                 this.indexSegment.close();
@@ -809,6 +809,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
             this.robots.close();
             this.crawlQueues.close();
+            log.logInfo("SWITCH NETWORK: START UP OF NEW INDEX DATABASE...");
+            // start up
             setConfig("network.unit.definition", networkDefinition);
             overwriteNetworkDefinition();
@@ -846,15 +848,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
                     log,
                     this.queuesRoot);
-            // we need a new stacker, because this uses network-specific attributes to sort out urls (local, global)
-            this.crawlStacker = new CrawlStacker(
-                    this.crawlQueues,
-                    this.crawler,
-                    this.indexSegment,
-                    this.peers,
-                    "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
-                    "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
             // create new web structure
             this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
@@ -873,11 +867,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
                     this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
                     this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
+            // we need a new stacker, because this uses network-specific attributes to sort out urls (local, global)
+            this.crawlStacker = new CrawlStacker(
+                    this.crawlQueues,
+                    this.crawler,
+                    this.indexSegment,
+                    this.peers,
+                    "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
+                    "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
         }
         // start up crawl jobs
         continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
         continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
-        this.log.logInfo("switched network to " + networkDefinition);
+        log.logInfo("SWITCH NETWORK: FINISHED START UP, new network is now '" + networkDefinition + "'.");
         // check status of account configuration: when local url crawling is allowed, it is not allowed
         // that an automatic authorization of localhost is done, because in this case crawls from local
         // addresses are blocked to prevent attack szenarios where remote pages contain links to localhost
@@ -1032,11 +1036,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
                 new RankingProfile("", crypt.simpleDecode(sb.getConfig("rankingProfile", ""), null));
     }
-    public boolean onlineCaution() {
-        return
-            (System.currentTimeMillis() - this.proxyLastAccess < Integer.parseInt(getConfig(plasmaSwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, "30000"))) ||
-            (System.currentTimeMillis() - this.localSearchLastAccess < Integer.parseInt(getConfig(plasmaSwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, "30000"))) ||
-            (System.currentTimeMillis() - this.remoteSearchLastAccess < Integer.parseInt(getConfig(plasmaSwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, "30000")));
+    /**
+     * checks if the proxy, the local search or remote search was accessed some time before.
+     * If no limit is exceeded, null is returned. If a limit is exceeded,
+     * then the name of the service that caused the caution is returned
+     * @return
+     */
+    public String onlineCaution() {
+        if (System.currentTimeMillis() - this.proxyLastAccess < Integer.parseInt(getConfig(plasmaSwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, "30000"))) return "proxy";
+        if (System.currentTimeMillis() - this.localSearchLastAccess < Integer.parseInt(getConfig(plasmaSwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, "30000"))) return "localsearch";
+        if (System.currentTimeMillis() - this.remoteSearchLastAccess < Integer.parseInt(getConfig(plasmaSwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, "30000"))) return "remotesearch";
+        return null;
     }
     private static String ppRamString(long bytes) {
@@ -1113,7 +1123,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
             doIndexing = false;
         }
-        synchronized (crawler.queuePreStack) {
+        synchronized (crawler.indexingStack) {
             /* =========================================================================
              * STORING DATA
              *
@@ -1149,7 +1159,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
             if (doIndexing && supportError == null) {
                 // enqueue for further crawling
-                enQueue(this.crawler.queuePreStack.newEntry(
+                enQueue(this.crawler.indexingStack.newEntry(
                         entry.url(),
                         (entry.referrerURL() == null) ? null : entry.referrerURL().hash(),
                         entry.ifModifiedSince(),
@@ -1179,13 +1189,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         terminateAllThreads(true);
         log.logConfig("SWITCHBOARD SHUTDOWN STEP 2: sending termination signal to threaded indexing");
         // closing all still running db importer jobs
-        dhtDispatcher.close();
         indexingDocumentProcessor.announceShutdown();
         indexingDocumentProcessor.awaitShutdown(12000);
         crawlStacker.announceClose();
         indexingCondensementProcessor.announceShutdown();
         indexingAnalysisProcessor.announceShutdown();
         indexingStorageProcessor.announceShutdown();
+        dhtDispatcher.close();
         indexingCondensementProcessor.awaitShutdown(12000);
         indexingAnalysisProcessor.awaitShutdown(12000);
         indexingStorageProcessor.awaitShutdown(12000);
@@ -1213,13 +1223,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     }
     public int queueSize() {
-        return crawler.queuePreStack.size();
+        return crawler.indexingStack.size();
     }
     public void enQueue(final IndexingStack.QueueEntry job) {
         assert job != null;
         try {
-            crawler.queuePreStack.push(job);
+            crawler.indexingStack.push(job);
         } catch (final IOException e) {
             log.logSevere("IOError in plasmaSwitchboard.enQueue: " + e.getMessage(), e);
         }
@@ -1234,24 +1244,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     public IndexingStack.QueueEntry deQueue() {
         // getting the next entry from the indexing queue
         IndexingStack.QueueEntry nextentry = null;
-        synchronized (crawler.queuePreStack) {
+        synchronized (crawler.indexingStack) {
             // do one processing step
-            if (this.log.isFine()) log.logFine("DEQUEUE: sbQueueSize=" + crawler.queuePreStack.size() +
+            if (this.log.isFine()) log.logFine("DEQUEUE: sbQueueSize=" + crawler.indexingStack.size() +
                     ", coreStackSize=" + crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) +
                     ", limitStackSize=" + crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) +
                     ", overhangStackSize=" + crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) +
                    ", remoteStackSize=" + crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE));
             try {
-                final int sizeBefore = crawler.queuePreStack.size();
+                final int sizeBefore = crawler.indexingStack.size();
                 if (sizeBefore == 0) return null;
-                nextentry = crawler.queuePreStack.pop();
+                nextentry = crawler.indexingStack.pop();
                 if (nextentry == null) {
                     log.logWarning("deQueue: null entry on queue stack.");
-                    if (crawler.queuePreStack.size() == sizeBefore) {
+                    if (crawler.indexingStack.size() == sizeBefore) {
                         // this is a severe problem: because this time a null is returned, it means that this status will last forever
                         // to re-enable use of the sbQueue, it must be emptied completely
                         log.logSevere("deQueue: does not shrink after pop() == null. Emergency reset of sbQueue");
-                        crawler.queuePreStack.clear();
+                        crawler.indexingStack.clear();
                     }
                     return null;
                 }
@@ -1285,7 +1295,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
                 // create a queue entry
                 Document document = surrogate.document();
-                queueentry = this.crawler.queuePreStack.newEntry(surrogate.url(), null, null, false, null, 0, this.crawler.defaultSurrogateProfile.handle(), null);
+                queueentry = this.crawler.indexingStack.newEntry(surrogate.url(), null, null, false, null, 0, this.crawler.defaultSurrogateProfile.handle(), null);
                 /*
                  * public QueueEntry newEntry(final yacyURL url, final String referrer, final Date ifModifiedSince, final boolean requestWithCookie,
                  final String initiator, final int depth, final String profilehandle, final String anchorName)
@@ -1311,12 +1321,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     public boolean deQueueProcess() {
         try {
             // work off fresh entries from the proxy or from the crawler
-            if (onlineCaution()) {
-                if (this.log.isFine()) log.logFine("deQueue: online caution, omitting resource stack processing");
+            String cautionCause = onlineCaution();
+            if (cautionCause != null) {
+                if (this.log.isFine()) log.logFine("deQueue: online caution for " + cautionCause + ", omitting resource stack processing");
                 return false;
             }
-            boolean doneSomething = false;
             // check for interruption
             checkInterruption();
@@ -1334,9 +1343,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
             }
             // getting the next entry from the indexing queue
-            if (crawler.queuePreStack.size() == 0) {
-                //log.logFine("deQueue: nothing to do, queue is emtpy");
-                return doneSomething; // nothing to do
+            if (crawler.indexingStack.size() == 0) {
+                if (log.isFinest()) log.logFinest("deQueue: nothing to do, queue is emtpy");
+                return false; // nothing to do
             }
             // if we were interrupted we should return now
@@ -1347,10 +1356,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
             // get next queue entry and start a queue processing
             final IndexingStack.QueueEntry queueEntry = deQueue();
-            if (queueEntry == null) return true;
+            if (queueEntry == null) {
+                if (this.log.isFine()) log.logFine("deQueue: queue entry is null");
+                return false;
+            }
             if (queueEntry.profile() == null) {
                 queueEntry.close();
-                return true;
+                if (this.log.isFine()) log.logFine("deQueue: profile is null");
+                return false;
             }
             // check if the document should be indexed
@@ -1366,19 +1379,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
             if (noIndexReason != null) {
                 // this document should not be indexed. log cause and close queue
                 final yacyURL referrerURL = queueEntry.referrerURL(this.indexSegment.urlMetadata());
-                log.logFine("Not indexed any word in URL " + queueEntry.url() + "; cause: " + noIndexReason);
+                if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + queueEntry.url() + "; cause: " + noIndexReason);
                 addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), queueEntry.anchorName(), noIndexReason);
                 // finish this entry
                 return true;
             }
-            // put document into the concurrent processing queue
-            crawler.queuePreStack.enQueueToActive(queueEntry);
             // check for interruption
             checkInterruption();
-            // enqueue to indexing queue
+            // put document into the concurrent processing queue
+            if (log.isFinest()) log.logFinest("deQueue: passing entry to indexing queue");
+            this.crawler.indexingStack.store(queueEntry);
             this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(queueEntry, null, null));
             return true;
         } catch (final InterruptedException e) {
@@ -1879,7 +1891,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     public String toString() {
         // it is possible to use this method in the cgi pages.
         // actually it is used there for testing purpose
-        return "PROPS: " + super.toString() + "; QUEUE: " + crawler.queuePreStack.toString();
+        return "PROPS: " + super.toString() + "; QUEUE: " + crawler.indexingStack.toString();
     }
     // method for index deletion
@@ -2027,8 +2039,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     }
     public String dhtShallTransfer() {
-        if (onlineCaution()) {
-            return "online caution, dht transmission";
+        String cautionCause = onlineCaution();
+        if (cautionCause != null) {
+            return "online caution for " + cautionCause + ", dht transmission";
         }
         if (this.peers == null) {
             return "no DHT distribution: seedDB == null";
@@ -2055,10 +2068,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
             return "no DHT distribution: not enough words - wordIndex.size() = " + indexSegment.termIndex().sizesMax();
         }
         if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false").equalsIgnoreCase("false")) && (crawlQueues.noticeURL.notEmptyLocal())) {
-            return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + crawler.queuePreStack.size();
+            return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + crawler.indexingStack.size();
         }
-        if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_INDEXING, "false").equalsIgnoreCase("false")) && (crawler.queuePreStack.size() > 1)) {
-            return "no DHT distribution: indexing in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + crawler.queuePreStack.size();
+        if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_INDEXING, "false").equalsIgnoreCase("false")) && (crawler.indexingStack.size() > 1)) {
+            return "no DHT distribution: indexing in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + crawler.indexingStack.size();
         }
         return null; // this means; yes, please do dht transfer
     }
@@ -2275,7 +2288,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     public boolean waitForShutdown() throws InterruptedException {
         this.shutdownSync.P();
-        dhtDispatcher.close();
         return this.terminate;
     }

@@ -903,6 +903,7 @@ public class yacyURL implements Serializable {
                 if (this.hash == null) this.hash = urlHashComputation();
             }
         }
+        //if (domDomain(this.hash) != 7) System.out.println("*** DEBUG - not local: " + this.toNormalform(true, false));
         return domDomain(this.hash) == 7;
     }

@@ -421,9 +421,9 @@ public final class yacy {
                     server.interrupt();
                     MultiThreadedHttpConnectionManager.shutdownAll();
                 }
-                MultiThreadedHttpConnectionManager.shutdownAll();
                 Log.logConfig("SHUTDOWN", "server has terminated");
                 sb.close();
+                MultiThreadedHttpConnectionManager.shutdownAll();
             }
         } catch (final Exception e) {
             Log.logSevere("STARTUP", "Unexpected Error: " + e.getClass().getName(),e);
