// plasmaCrawlQueues.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 29.10.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.plasma.crawler;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;

import de.anomic.data.robotsParser;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

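/**
 * Management of the crawl queues: the notice-URL stacks (core, limit,
 * overhang, remote), the error and delegated URL databases, and the pool
 * of crawlWorker threads that load the documents. The job methods
 * coreCrawlJob(), limitCrawlTriggerJob() and remoteTriggeredCrawlJob()
 * each work off at most one entry per call and are meant to be invoked
 * repeatedly. A minimal usage sketch (assuming an initialized
 * plasmaSwitchboard sb and its plasma DB path; not the only way these
 * jobs are driven in YaCy):
 *
 * <pre>
 * plasmaCrawlQueues queues = new plasmaCrawlQueues(sb, plasmaPath);
 * while (queues.coreCrawlJob()) { ... } // work off the local crawl stack
 * queues.close();
 * </pre>
 */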
public class plasmaCrawlQueues {

    private plasmaSwitchboard sb;
    private serverLog log;
    private HashMap workers; // mapping from url hash to Worker thread object
    private plasmaProtocolLoader loader;

    public plasmaCrawlNURL noticeURL;
    public plasmaCrawlZURL errorURL, delegatedURL;

    public plasmaCrawlQueues(plasmaSwitchboard sb, File plasmaPath) {
        this.sb = sb;
        this.log = new serverLog("CRAWLER");
        this.workers = new HashMap();
        this.loader = new plasmaProtocolLoader(sb, log);

        // start crawling management
        log.logConfig("Starting Crawling Management");
        noticeURL = new plasmaCrawlNURL(plasmaPath);
        //errorURL = new plasmaCrawlZURL(); // fresh error DB each startup; can be held in RAM, which reduces IO
        errorURL = new plasmaCrawlZURL(plasmaPath, "urlError1.db", true);
        delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated1.db", false);
    }

    /**
     * Tests whether a URL hash occurs in any of the crawler databases.
     * If it exists, the name of the database is returned; if it does not
     * exist, null is returned.
     */
    public String urlExists(String hash) {
        if (noticeURL.existsInStack(hash)) return "crawler";
        if (delegatedURL.exists(hash)) return "delegated";
        if (errorURL.exists(hash)) return "errors";
        if (workers.containsKey(new Integer(hash.hashCode()))) return "workers";
        return null;
    }

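    /**
     * Removes a URL hash from the notice-URL stacks and from the delegated
     * and error databases, wherever it may be registered.
     */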
    public void urlRemove(String hash) {
        noticeURL.removeByURLHash(hash);
        delegatedURL.remove(hash);
        errorURL.remove(hash);
    }

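    /**
     * Resolves a URL hash to its URL by searching the active workers, the
     * notice-URL stacks, and the delegated and error databases, in that
     * order. Note that lookups in the workers map assume its Integer keys
     * (entry.hashCode(), see processLocalCrawling) match the hashCode of
     * the URL hash. Returns null if the hash is unknown.
     */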
    public yacyURL getURL(String urlhash) {
        if (urlhash.equals(yacyURL.dummyHash)) return null;
        // the workers map holds crawlWorker objects, not plasmaCrawlEntry objects
        crawlWorker w = (crawlWorker) workers.get(new Integer(urlhash.hashCode()));
        if (w != null) return w.entry.url();
        plasmaCrawlEntry ne = noticeURL.get(urlhash);
        if (ne != null) return ne.url();
        plasmaCrawlZURL.Entry ee = delegatedURL.getEntry(urlhash);
        if (ee != null) return ee.url();
        ee = errorURL.getEntry(urlhash);
        if (ee != null) return ee.url();
        return null;
    }

    public void close() {
        // wait for all workers to finish
        Iterator i = workers.values().iterator();
        while (i.hasNext()) ((Thread) i.next()).interrupt();
        // TODO: wait some more time until all threads are finished
    }

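    /**
     * Returns a snapshot of the crawl entries that are currently being
     * loaded by active worker threads.
     */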
    public plasmaCrawlEntry[] activeWorker() {
        synchronized (workers) {
            plasmaCrawlEntry[] w = new plasmaCrawlEntry[workers.size()];
            int i = 0;
            Iterator j = workers.values().iterator();
            while (j.hasNext()) {
                w[i++] = ((crawlWorker) j.next()).entry;
            }
            return w;
        }
    }

    public boolean isSupportedProtocol(String protocol) {
        return loader.isSupportedProtocol(protocol);
    }

    public int coreCrawlJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
    }

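    /**
     * Works off one entry from the core (local) crawl stack: pops an entry,
     * resolves its crawl profile, checks the protocol, and hands the entry
     * to a worker thread. Returns true if an entry was processed (or
     * consumed as invalid), false if the queue is empty or the system is
     * too busy.
     */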
    public boolean coreCrawlJob() {
        if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
            //log.logDebug("CoreCrawl: queue is empty");
            return false;
        }
        if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots) {
            log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" +
                    "sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }
        if (this.size() >= plasmaSwitchboard.crawlSlots) {
            log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" +
                    "cacheLoader=" + this.size() + ")");
            return false;
        }
        if (sb.onlineCaution()) {
            log.logFine("CoreCrawl: online caution, omitting processing");
            return false;
        }
        // if the server is busy, we do crawling more slowly
        //if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}

        // if crawling was paused we have to wait until we were notified to continue
        Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
        synchronized (status[plasmaSwitchboard.CRAWLJOB_SYNC]) {
            if (((Boolean) status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) {
                try {
                    status[plasmaSwitchboard.CRAWLJOB_SYNC].wait();
                } catch (InterruptedException e) {
                    return false;
                }
            }
        }

        // do a local crawl
        plasmaCrawlEntry urlEntry = null;
        while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
            String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
            try {
                urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE, true);
                String profileHandle = urlEntry.profileHandle();
                // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
                // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
                if (profileHandle == null) {
                    log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                    return true;
                }
                plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle);
                if (profile == null) {
                    log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                    return true;
                }

                // check if the protocol is supported
                yacyURL url = urlEntry.url();
                String urlProtocol = url.getProtocol();
                if (!this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
                    this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'");
                    return true;
                }

                log.logFine("LOCALCRAWL: URL=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter()
                        + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed().isSenior()) || (yacyCore.seedDB.mySeed().isPrincipal())) ? "true" : "false")));

                processLocalCrawling(urlEntry, stats);
                return true;
            } catch (IOException e) {
                log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
                if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
            }
        }
        return true;
    }

    public int limitCrawlTriggerJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
    }

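    /**
     * Works off one entry from the limit (global crawl trigger) stack:
     * either delegates the URL to a remote peer or, if no peer accepts it,
     * crawls it locally. May also shift entries to the core stack when the
     * local crawl is running dry. Returns false if the queue is empty or
     * the system is too busy.
     */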
    public boolean limitCrawlTriggerJob() {
        if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) {
            //log.logDebug("LimitCrawl: queue is empty");
            return false;
        }
        boolean robinsonPrivateCase = ((sb.isRobinsonMode()) &&
                (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PUBLIC_CLUSTER)) &&
                (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PRIVATE_CLUSTER)));

        if ((robinsonPrivateCase) || ((coreCrawlJobSize() <= 20) && (limitCrawlTriggerJobSize() > 10))) {
            // it is not efficient if the core crawl job is empty while we have too much to do here;
            // move some tasks to the core crawl job
            int toshift = 10; // this cannot be a big number because the balancer forces a wait if it cannot balance
            if (toshift > limitCrawlTriggerJobSize()) toshift = limitCrawlTriggerJobSize();
            for (int i = 0; i < toshift; i++) {
                noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
            }
            log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() + ", limitCrawlTriggerJobSize()=" + limitCrawlTriggerJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "") + ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off") + ")");
            if (robinsonPrivateCase) return false;
        }

        // check local indexing queues;
        // in case the placing of the remote crawl fails, there must be space in the local queue to work off the remote crawl
        if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots * 2) {
            log.logFine("LimitCrawl: too many processes in indexing queue, dismissed (" +
                    "sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }
        if (this.size() >= plasmaSwitchboard.crawlSlots) {
            log.logFine("LimitCrawl: too many processes in loader queue, dismissed (" +
                    "cacheLoader=" + this.size() + ")");
            return false;
        }
        if (sb.onlineCaution()) {
            log.logFine("LimitCrawl: online caution, omitting processing");
            return false;
        }

        // if crawling was paused we have to wait until we were notified to continue
        Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
        synchronized (status[plasmaSwitchboard.CRAWLJOB_SYNC]) {
            if (((Boolean) status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) {
                try {
                    status[plasmaSwitchboard.CRAWLJOB_SYNC].wait();
                } catch (InterruptedException e) {
                    return false;
                }
            }
        }

        // start a global crawl, if possible
        String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
        try {
            plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT, true);
            String profileHandle = urlEntry.profileHandle();
            // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
            // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
            plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle);
            if (profile == null) {
                log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                return true;
            }

            // check if the protocol is supported
            yacyURL url = urlEntry.url();
            String urlProtocol = url.getProtocol();
            if (!this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
                this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'");
                return true;
            }

            log.logFine("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter="
                    + profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed().isSenior()) || (yacyCore.seedDB.mySeed().isPrincipal())) ? "true" : "false")));

            boolean tryRemote = ((noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sb.sbQueue.size() != 0)) &&
                    (profile.remoteIndexing()) &&
                    (urlEntry.initiator() != null) &&
                    // (!(urlEntry.initiator().equals(indexURL.dummyHash))) &&
                    ((yacyCore.seedDB.mySeed().isSenior()) || (yacyCore.seedDB.mySeed().isPrincipal()));
            if (tryRemote) {
                // checking robots.txt for http(s) resources
                if ((urlProtocol.equals("http") || urlProtocol.equals("https")) && robotsParser.isDisallowed(url)) {
                    this.log.logFine("Crawling of URL '" + url.toString() + "' disallowed by robots.txt.");
                    return true;
                }
                boolean success = processRemoteCrawlTrigger(urlEntry);
                if (success) return true;
            }

            processLocalCrawling(urlEntry, stats); // emergency case, work off the crawl locally
            return true;
        } catch (IOException e) {
            log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
            if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT);
            return true; // if we returned false here we would block everything
        }
    }

    public int remoteTriggeredCrawlJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
    }

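    /**
     * Works off one crawl request that another peer has placed on our
     * remote crawl stack. Returns true if an entry was processed, false if
     * the stack is empty or the system is too busy.
     */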
    public boolean remoteTriggeredCrawlJob() {
        // work off crawl requests that other peers have placed on our crawl stack

        // do nothing if either there are private processes to be done
        // or there is no global crawl on the stack
        if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
            //log.logDebug("GlobalCrawl: queue is empty");
            return false;
        }
        if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots) {
            log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" +
                    "sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }
        if (this.size() >= plasmaSwitchboard.crawlSlots) {
            log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" +
                    "cacheLoader=" + this.size() + ")");
            return false;
        }
        if (sb.onlineCaution()) {
            log.logFine("GlobalCrawl: online caution, omitting processing");
            return false;
        }

        // if crawling was paused we have to wait until we were notified to continue
        Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
        synchronized (status[plasmaSwitchboard.CRAWLJOB_SYNC]) {
            if (((Boolean) status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) {
                try {
                    status[plasmaSwitchboard.CRAWLJOB_SYNC].wait();
                } catch (InterruptedException e) {
                    return false;
                }
            }
        }

        // we don't want to crawl a global URL globally, since WE are the global part (from this point of view)
        String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
        try {
            plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE, true);
            String profileHandle = urlEntry.profileHandle();
            // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
            // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
            plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle);

            if (profile == null) {
                log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                return false;
            }

            // check if the protocol is supported
            yacyURL url = urlEntry.url();
            String urlProtocol = url.getProtocol();
            if (!this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
                this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'");
                return true;
            }

            log.logFine("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter="
                    + profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed().isSenior()) || (yacyCore.seedDB.mySeed().isPrincipal())) ? "true" : "false")));

            processLocalCrawling(urlEntry, stats);
            return true;
        } catch (IOException e) {
            log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
            if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_REMOTE);
            return true;
        }
    }

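    /**
     * Hands one crawl stack entry to a new crawlWorker thread and registers
     * the worker in the workers map so that it can be found by urlExists()
     * and getURL() while the load is in progress.
     */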
    private void processLocalCrawling(plasmaCrawlEntry entry, String stats) {
        // work off one crawl stack entry
        if ((entry == null) || (entry.url() == null)) {
            log.logInfo(stats + ": urlEntry = null");
            return;
        }

        synchronized (workers) {
            crawlWorker w = new crawlWorker(entry);
            workers.put(new Integer(entry.hashCode()), w);
        }

        log.logInfo(stats + ": enqueued for load " + entry.url() + " [" + entry.url().hash() + "]");
    }

    /**
     * If this returns true, the urlEntry is considered as stored somewhere
     * and the case is finished. If this returns false, the urlEntry will be
     * enqueued to the local crawl again.
     */
    private boolean processRemoteCrawlTrigger(plasmaCrawlEntry urlEntry) {
        // wrong access
        if (urlEntry == null) {
            log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
            return true; // superfluous request; true is correct in this context because the urlEntry shall not be tracked any more
        }

        // check url
        if (urlEntry.url() == null) {
            log.logFine("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name());
            return true; // same case as above: no more consideration
        }

        // are we qualified for a remote crawl?
        if ((yacyCore.seedDB.mySeed() == null) || (yacyCore.seedDB.mySeed().isJunior())) {
            log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
            return false; // no, we must crawl this page ourselves
        }

        // check if a peer for the remote crawl is available
        yacySeed remoteSeed = ((sb.isPublicRobinson()) && (sb.getConfig("cluster.mode", "").equals("publiccluster"))) ?
                yacyCore.dhtAgent.getPublicClusterCrawlSeed(urlEntry.url().hash(), sb.clusterhashes) :
                yacyCore.dhtAgent.getGlobalCrawlSeed(urlEntry.url().hash());
        if (remoteSeed == null) {
            log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available");
            return false;
        }

        // do the request
        HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), sb.getURL(urlEntry.referrerhash()), 6000);
        if (page == null) {
            log.logSevere(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CAUSE: crawl order returned no result (referrer hash: " + urlEntry.referrerhash() + ")");
            return false;
        }

        // check if we got contact to the peer and the peer responded
        if (page.get("delay") == null) {
            log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer.");
            yacyCore.peerActions.peerDeparture(remoteSeed, "remote crawl to peer failed; peer answered inappropriately");
            return false; // no response from peer, we will crawl this ourselves
        }

        String response = (String) page.get("response");
        log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed="
                + remoteSeed.getName() + ", url=" + urlEntry.url().toString()
                + ", response=" + page.toString()); // DEBUG

        // we received an answer and we are told to wait a specific time until we shall ask again for another crawl
        int newdelay = Integer.parseInt((String) page.get("delay"));
        yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
        if (response.equals("stacked")) {
            // success, the remote peer accepted the crawl
            log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
                    + " PLACED URL=" + urlEntry.url().toString()
                    + "; NEW DELAY=" + newdelay);
            // track this remote crawl
            delegatedURL.newEntry(urlEntry, remoteSeed.hash, new Date(), 0, response).store();
            return true;
        }

        // check other cases: the remote peer may respond that it already knows that url
        if (response.equals("double")) {
            // in case the peer answers "double", it transmits the complete lurl data
            String lurl = (String) page.get("lurl");
            if ((lurl != null) && (lurl.length() != 0)) {
                String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
                indexURLEntry entry = sb.wordIndex.loadedURL.newEntry(propStr);
                try {
                    sb.wordIndex.loadedURL.store(entry);
                    sb.wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed().hash, remoteSeed.hash, 1); // *** superfluous/duplicated?
                    // noticeURL.remove(entry.hash());
                } catch (IOException e) {
                    log.logSevere("processRemoteCrawlTrigger: cannot store lurl entry: " + e.getMessage(), e);
                }

                log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
                        + " SUPERFLUOUS. CAUSE: " + page.get("reason")
                        + " (URL=" + urlEntry.url().toString()
                        + "). URL IS CONSIDERED AS 'LOADED!'");
                return true;
            } else {
                log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
                        + " REJECTED. CAUSE: bad lurl response / " + page.get("reason") + " (URL="
                        + urlEntry.url().toString() + ")");
                remoteSeed.setFlagAcceptRemoteCrawl(false);
                yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
                return false;
            }
        }

        log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
                + " DENIED. RESPONSE=" + response + ", CAUSE="
                + page.get("reason") + ", URL=" + urlEntry.url().toString());
        remoteSeed.setFlagAcceptRemoteCrawl(false);
        yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
        return false;
    }

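    /**
     * Loads a single resource from the web, outside of the regular crawl,
     * using a snippet-fetch crawl profile. Note that the socketTimeout and
     * keepInMemory parameters are currently not passed on to the loader.
     */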
    public plasmaHTCache.Entry loadResourceFromWeb(
            yacyURL url,
            int socketTimeout,
            boolean keepInMemory,
            boolean forText
    ) {
        plasmaCrawlEntry centry = new plasmaCrawlEntry(
                yacyCore.seedDB.mySeed().hash,
                url,
                null,
                "",
                new Date(),
                (forText) ? sb.defaultTextSnippetProfile.handle() : sb.defaultMediaSnippetProfile.handle(), // crawl profile
                0,
                0,
                0);

        return loader.load(centry);
    }

    public int size() {
        return workers.size();
    }

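    /**
     * A worker thread that loads exactly one crawl entry: it checks
     * robots.txt for http(s) resources, hands the entry to the protocol
     * loader, records failures in the error-URL database, and deregisters
     * itself from the workers map when done.
     */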
    protected class crawlWorker extends Thread {

        public plasmaCrawlEntry entry;

        public crawlWorker(plasmaCrawlEntry entry) {
            this.entry = entry;
            this.entry.setStatus("worker-initialized");
            this.start();
        }

        public void run() {
            try {
                // checking robots.txt for http(s) resources
                this.entry.setStatus("worker-checkingrobots");
                if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https")) && robotsParser.isDisallowed(entry.url())) {
                    log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
                    plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), "denied by robots.txt");
                    eentry.store();
                    errorURL.push(eentry);
                } else {
                    // starting a load from the internet
                    this.entry.setStatus("worker-loading");
                    String result = loader.process(this.entry);
                    if (result != null) {
                        plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), "cannot load: " + result);
                        eentry.store();
                        errorURL.push(eentry);
                    } else {
                        this.entry.setStatus("worker-processed");
                    }
                }
            } catch (Exception e) {
                plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), e.getMessage() + " - in worker");
                eentry.store();
                errorURL.push(eentry);
                e.printStackTrace();
            } finally {
                synchronized (workers) {
                    workers.remove(new Integer(entry.hashCode()));
                }
                this.entry.setStatus("worker-finalized");
            }
        }

    }

}