- refactoring of CrawlStacker (to prepare it for new multi-threading to remove the DNS lookup bottleneck)

- fix of shallBeOwnWord target computation heuristic


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5392 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent b1e211b258
commit 2802138787

@ -62,7 +62,7 @@ public class ConfigAccounts_p {
}
if (localhostAccess) {
if (sb.acceptLocalURLs) {
if (sb.crawlStacker.acceptLocalURLs()) {
// in this case it is not allowed to use a localhostAccess option
prop.put("commitIntranetWarning", 1);
localhostAccess = false;

@ -105,14 +105,12 @@ public final class IndexTransfer_p {
} else {
if (!prop.containsKey("running_status")) prop.put("running_status","Not running");
}
//List known hosts
yacySeed seed;
int hc = 0;
if ((sb.webIndex.seedDB != null) && (sb.webIndex.seedDB.sizeConnected() > 0)) {
final Iterator<yacySeed> e = yacyPeerSelection.getAcceptRemoteIndexSeeds(sb.webIndex.seedDB, "AAAAAAAAAAAA", sb.webIndex.seedDB.sizeConnected());
final Iterator<yacySeed> e = yacyPeerSelection.getAcceptRemoteIndexSeeds(sb.webIndex.seedDB, null, sb.webIndex.seedDB.sizeConnected(), false);
final TreeMap<String, String> hostList = new TreeMap<String, String>();
while (e.hasNext()) {
seed = e.next();

@ -72,7 +72,7 @@ public class rct_p {
loaddate = new Date();
}
final yacyURL referrer = null; // referrer needed!
final String urlRejectReason = sb.acceptURL(url);
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(url);
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");

@ -127,7 +127,7 @@ public final class crawlReceipt {
}
// check if the entry is in our network domain
final String urlRejectReason = sb.acceptURL(comp.url());
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(comp.url());
if (urlRejectReason != null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "9999");

@ -128,7 +128,7 @@ public final class transferURL {
}
// check if the entry is in our network domain
final String urlRejectReason = sb.acceptURL(comp.url());
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(comp.url());
if (urlRejectReason != null) {
if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: blocked URL '" + comp.url() + "' (" + urlRejectReason + ") from peer " + otherPeerName);
lEntry = null;

@ -393,7 +393,7 @@ public class CrawlQueues {
} catch (final ParseException e) {
loaddate = new Date();
}
final String urlRejectReason = sb.acceptURL(url);
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(url);
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");

@ -28,151 +28,73 @@
package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.index.indexURLReference;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroEcoTable;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.kelondro.kelondroTree;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDomains;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
public final class CrawlStacker extends Thread {
private static final int EcoFSBufferSize = 20;
private static String stackfile = "urlNoticeStacker9.db";
// keys for different database types
public static final int QUEUE_DB_TYPE_RAM = 0;
public static final int QUEUE_DB_TYPE_TREE = 1;
public static final int QUEUE_DB_TYPE_ECO = 2;
public final class CrawlStacker {
final serverLog log = new serverLog("STACKCRAWL");
private final plasmaSwitchboard sb;
private final LinkedList<String> urlEntryHashCache;
private kelondroIndex urlEntryCache;
private final File cacheStacksPath;
private final int dbtype;
private final boolean prequeue;
private long dnsHit, dnsMiss;
private int alternateCount;
private final LinkedList<String> urlEntryHashCache; // the order how this queue is processed; entries with known DNS entries go first
private kelondroIndex urlEntryCache; // the entries in the queue
private long dnsHit, dnsMiss;
private int alternateCount;
private CrawlQueues nextQueue;
private plasmaWordIndex wordIndex;
private boolean acceptLocalURLs, acceptGlobalURLs;
// objects for the prefetch task
private final ArrayList<String> dnsfetchHosts = new ArrayList<String>();
public CrawlStacker(final plasmaSwitchboard sb, final File dbPath, final int dbtype, final boolean prequeue) {
this.sb = sb;
this.prequeue = prequeue;
// this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
public CrawlStacker(CrawlQueues cq, plasmaWordIndex wordIndex, boolean acceptLocalURLs, boolean acceptGlobalURLs) {
this.nextQueue = cq;
this.wordIndex = wordIndex;
this.dnsHit = 0;
this.dnsMiss = 0;
this.alternateCount = 0;
this.acceptLocalURLs = acceptLocalURLs;
this.acceptGlobalURLs = acceptGlobalURLs;
// init the message list
this.urlEntryHashCache = new LinkedList<String>();
// create a stack for newly entered entries
this.cacheStacksPath = dbPath;
this.dbtype = dbtype;
openDB();
try {
// loop through the list and fill the messageList with url hashs
final Iterator<kelondroRow.Entry> rows = this.urlEntryCache.rows(true, null);
kelondroRow.Entry entry;
while (rows.hasNext()) {
entry = rows.next();
if (entry == null) {
System.out.println("ERROR! null element found");
continue;
}
this.urlEntryHashCache.add(entry.getColString(0, null));
}
} catch (final kelondroException e) {
/* if we have an error, we start with a fresh database */
CrawlStacker.this.log.logSevere("Unable to initialize crawl stacker queue, kelondroException:" + e.getMessage() + ". Reseting DB.\n", e);
// deleting old db and creating a new db
try {this.urlEntryCache.close();} catch (final Exception ex) {}
deleteDB();
openDB();
} catch (final IOException e) {
/* if we have an error, we start with a fresh database */
CrawlStacker.this.log.logSevere("Unable to initialize crawl stacker queue, IOException:" + e.getMessage() + ". Reseting DB.\n", e);
// deleting old db and creating a new db
try {this.urlEntryCache.close();} catch (final Exception ex) {}
deleteDB();
openDB();
}
this.log.logInfo(size() + " entries in the stackCrawl queue.");
this.start(); // start the prefetcher thread
this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
this.log.logInfo("STACKCRAWL thread initialized.");
}
/**
 * DNS prefetcher loop: takes host names queued by prefetchHost() and resolves
 * them in the background so that later crawl stacking does not block on DNS.
 * Runs until the thread is interrupted.
 *
 * Fixes over the previous version:
 * - the empty-check and wait() now happen atomically under the same monitor
 *   ('this') that the producer uses for add()+notifyAll(), so wakeups cannot
 *   be lost and the list is no longer mutated under two different locks;
 * - wait() is guarded by a while-loop, so a spurious wakeup can no longer
 *   trigger remove(-1) and kill the thread with an IndexOutOfBoundsException;
 * - the interrupt status is restored when InterruptedException ends the loop.
 */
public void run() {
    try {
        while (!Thread.currentThread().isInterrupted()) { // action loop
            final String nextHost;
            synchronized (this) {
                // guard in a loop: protects against spurious and lost wakeups
                while (dnsfetchHosts.isEmpty()) wait();
                nextHost = dnsfetchHosts.remove(dnsfetchHosts.size() - 1);
            }
            try {
                serverDomains.dnsResolve(nextHost);
            } catch (final Exception e) {
                // best-effort prefetch: resolution failures are expected and ignored
            }
        }
    } catch (final InterruptedException e) {
        Thread.currentThread().interrupt(); // restore interrupt status for callers
    }
}
/**
 * Checks whether the DNS entry for the given host is already in the cache.
 * If it is not, the host is queued on dnsfetchHosts for asynchronous
 * resolution by the prefetcher thread, which is woken via notifyAll().
 * @param host host name to test/prefetch
 * @return true if the host was already known in the DNS cache,
 *         false if it was queued for background resolution
 */
public boolean prefetchHost(final String host) {
// returns true when the host was known in the dns cache.
// If not, the host is stacked on the fetch stack and false is returned
try {
serverDomains.dnsResolveFromCache(host);
return true;
} catch (final UnknownHostException e) {
// not cached yet: enqueue for the background resolver and wake it up
// (same monitor 'this' that the prefetcher thread waits on)
synchronized (this) {
dnsfetchHosts.add(host);
notifyAll();
}
return false;
}
}
public void terminateDNSPrefetcher() {
synchronized (this) {
interrupt();
/**
 * @return the number of entries currently waiting in the crawl stacker queue
 */
public int size() {
synchronized (this.urlEntryHashCache) {
return this.urlEntryHashCache.size();
}
}
/**
 * Removes all entries from the queue: both the processing-order hash list
 * and the backing entry cache.
 * NOTE(review): unlike size(), this method does not synchronize on
 * urlEntryHashCache — confirm no concurrent job() calls race with it.
 * @throws IOException if clearing the backing entry cache fails
 */
public void clear() throws IOException {
this.urlEntryHashCache.clear();
this.urlEntryCache.clear();
}
public void close() {
if (this.dbtype == QUEUE_DB_TYPE_RAM) {
this.log.logInfo("Shutdown. Flushing remaining " + size() + " crawl stacker job entries. please wait.");
while (size() > 0) {
if (!job()) break;
}
this.log.logInfo("Shutdown. Flushing remaining " + size() + " crawl stacker job entries. please wait.");
while (size() > 0) {
if (!job()) break;
}
terminateDNSPrefetcher();
this.log.logInfo("Shutdown. Closing stackCrawl queue.");
@ -182,26 +104,68 @@ public final class CrawlStacker extends Thread {
// clearing the hash list
this.urlEntryHashCache.clear();
}
/**
 * Checks whether the DNS entry for the given host is already in the cache.
 * If it is not, the host is queued on dnsfetchHosts for asynchronous
 * resolution and a waiting resolver is woken via notifyAll().
 * @param host host name to test/prefetch
 * @return true if the host was already known in the DNS cache,
 *         false if it was queued for background resolution
 */
private boolean prefetchHost(final String host) {
// returns true when the host was known in the dns cache.
// If not, the host is stacked on the fetch stack and false is returned
try {
serverDomains.dnsResolveFromCache(host);
return true;
} catch (final UnknownHostException e) {
// unknown host: hand it over to the background resolver
synchronized (this) {
dnsfetchHosts.add(host);
notifyAll();
}
return false;
}
}
public boolean job() {
CrawlEntry entry;
// this is the method that is called by the busy thread from outside
if (this.urlEntryHashCache.size() == 0) return false;
// get the next entry from the queue
String urlHash = null;
kelondroRow.Entry ec = null;
synchronized (this.urlEntryHashCache) {
urlHash = this.urlEntryHashCache.removeFirst();
if (urlHash == null) {
urlEntryHashCache.clear();
try {
urlEntryCache.clear();
} catch (IOException e) {
e.printStackTrace();
}
return false;
}
try {
ec = this.urlEntryCache.remove(urlHash.getBytes());
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
if (urlHash == null || ec == null) return false;
// make a crawl Entry out of it
CrawlEntry entry = null;
try {
entry = dequeueEntry();
} catch (final IOException e) {
e.printStackTrace();
entry = new CrawlEntry(ec);
} catch (IOException e1) {
e1.printStackTrace();
return false;
}
if (entry == null) return false;
try {
final String rejectReason = sb.crawlStacker.stackCrawl(entry);
final String rejectReason = stackCrawl(entry);
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, rejectReason);
final ZURL.Entry ee = nextQueue.errorURL.newEntry(entry, wordIndex.seedDB.mySeed().hash, new Date(), 1, rejectReason);
ee.store();
sb.crawlQueues.errorURL.push(ee);
nextQueue.errorURL.push(ee);
}
} catch (final Exception e) {
CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@ -270,8 +234,6 @@ public final class CrawlStacker extends Thread {
synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue;
boolean hostknown = true;
if (prequeue) hostknown = prefetchHost(nexturl.getHost());
try {
oldValue = this.urlEntryCache.put(newEntryRow);
} catch (final IOException e) {
@ -279,7 +241,7 @@ public final class CrawlStacker extends Thread {
}
if (oldValue == null) {
//System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
if (hostknown) {
if (prefetchHost(nexturl.getHost())) {
this.alternateCount++;
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.dnsHit++;
@ -297,79 +259,9 @@ public final class CrawlStacker extends Thread {
}
}
/**
 * Deletes the on-disk queue file for the configured database type.
 * A RAM-backed queue has no file, so nothing is done in that case;
 * both the ECO and TREE types store their data in the same stack file.
 */
private void deleteDB() {
    switch (this.dbtype) {
        case QUEUE_DB_TYPE_RAM:
            // in-memory only: nothing on disk to remove
            break;
        case QUEUE_DB_TYPE_ECO:
        case QUEUE_DB_TYPE_TREE:
            new File(cacheStacksPath, stackfile).delete();
            break;
        default:
            // unknown type: no file association, nothing to do
            break;
    }
}
/**
 * Opens (or creates) the backing store for the crawl stacker queue,
 * depending on the configured database type:
 * RAM  -> in-memory kelondroRowSet,
 * ECO  -> kelondroEcoTable file (deleted and recreated once on open failure),
 * TREE -> kelondroTree wrapped in a kelondroCache.
 */
private void openDB() {
if (!(cacheStacksPath.exists())) cacheStacksPath.mkdir(); // make the path
if (this.dbtype == QUEUE_DB_TYPE_RAM) {
this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
}
if (this.dbtype == QUEUE_DB_TYPE_ECO) {
cacheStacksPath.mkdirs();
final File f = new File(cacheStacksPath, stackfile);
try {
this.urlEntryCache = new kelondroEcoTable(f, CrawlEntry.rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
//this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, CrawlEntry.rowdef, 0, true));
} catch (final Exception e) {
e.printStackTrace();
// kill DB and try again
f.delete();
//kelondroFlexTable.delete(cacheStacksPath, newCacheName);
try {
this.urlEntryCache = new kelondroEcoTable(f, CrawlEntry.rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
//this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, CrawlEntry.rowdef, 0, true));
} catch (final Exception ee) {
// second attempt with a fresh file failed as well: give up hard
ee.printStackTrace();
System.exit(-1);
}
}
}
if (this.dbtype == QUEUE_DB_TYPE_TREE) {
final File cacheFile = new File(cacheStacksPath, stackfile);
cacheFile.getParentFile().mkdirs();
this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, 0, CrawlEntry.rowdef));
}
}
/**
 * @return the number of entries currently waiting in the crawl stacker queue
 */
public int size() {
synchronized (this.urlEntryHashCache) {
return this.urlEntryHashCache.size();
}
}
/**
 * @return the configured queue database type
 *         (QUEUE_DB_TYPE_RAM, QUEUE_DB_TYPE_TREE or QUEUE_DB_TYPE_ECO)
 */
public int getDBType() {
return this.dbtype;
}
/**
 * Removes and returns the next crawl entry from the head of the queue.
 * The hash list determines processing order; the row data for the dequeued
 * hash is removed from the backing entry cache under the same lock.
 *
 * @return the next CrawlEntry, or null if the queue is empty or no row is
 *         stored for the dequeued hash
 * @throws IOException if a null hash was stored in the queue, or the
 *         backing cache fails
 */
public CrawlEntry dequeueEntry() throws IOException {
    if (this.urlEntryHashCache.isEmpty()) return null;
    final kelondroRow.Entry row;
    synchronized (this.urlEntryHashCache) {
        final String hash = this.urlEntryHashCache.removeFirst();
        if (hash == null) throw new IOException("urlHash is null");
        row = this.urlEntryCache.remove(hash.getBytes());
    }
    return (row == null) ? null : new CrawlEntry(row);
}
public String stackCrawl(final CrawlEntry entry) {
private String stackCrawl(final CrawlEntry entry) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
@ -379,7 +271,7 @@ public final class CrawlStacker extends Thread {
// check if the protocol is supported
final String urlProtocol = entry.url().getProtocol();
if (!sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
if (!nextQueue.isSupportedProtocol(urlProtocol)) {
reason = "unsupported protocol";
this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
@ -387,9 +279,9 @@ public final class CrawlStacker extends Thread {
}
// check if ip is local ip address
final String urlRejectReason = sb.acceptURL(entry.url());
final String urlRejectReason = urlInAcceptedDomain(entry.url());
if (urlRejectReason != null) {
reason = "denied_(" + urlRejectReason + ")_domain=" + sb.getConfig("network.unit.domain", "unknown");
reason = "denied_(" + urlRejectReason + ")";
if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
@ -402,7 +294,7 @@ public final class CrawlStacker extends Thread {
return reason;
}
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
if (profile == null) {
final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
log.logWarning(errorMsg);
@ -443,7 +335,7 @@ public final class CrawlStacker extends Thread {
return reason;
}
final yacyURL referrerURL = (entry.referrerhash() == null) ? null : sb.crawlQueues.getURL(entry.referrerhash());
final yacyURL referrerURL = (entry.referrerhash() == null) ? null : nextQueue.getURL(entry.referrerhash());
// add domain to profile domain list
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
@ -467,8 +359,8 @@ public final class CrawlStacker extends Thread {
}
// check if the url is double registered
final String dbocc = sb.crawlQueues.urlExists(entry.url().hash());
final indexURLReference oldEntry = this.sb.webIndex.getURL(entry.url().hash(), null, 0);
final String dbocc = nextQueue.urlExists(entry.url().hash());
final indexURLReference oldEntry = wordIndex.getURL(entry.url().hash(), null, 0);
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {
@ -489,16 +381,16 @@ public final class CrawlStacker extends Thread {
}
// store information
final boolean local = entry.initiator().equals(sb.webIndex.seedDB.mySeed().hash);
final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(this.sb.webIndex.defaultProxyProfile.handle());
final boolean remote = profile.handle().equals(this.sb.webIndex.defaultRemoteProfile.handle());
final boolean local = entry.initiator().equals(wordIndex.seedDB.mySeed().hash);
final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(wordIndex.defaultProxyProfile.handle());
final boolean remote = profile.handle().equals(wordIndex.defaultRemoteProfile.handle());
final boolean global =
(profile.remoteIndexing()) /* granted */ &&
(entry.depth() == profile.depth()) /* leaf node */ &&
//(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
(
(sb.webIndex.seedDB.mySeed().isSenior()) ||
(sb.webIndex.seedDB.mySeed().isPrincipal())
(wordIndex.seedDB.mySeed().isSenior()) ||
(wordIndex.seedDB.mySeed().isPrincipal())
) /* qualified */;
if (!local && !global && !remote && !proxy) {
@ -508,23 +400,62 @@ public final class CrawlStacker extends Thread {
// it may be possible that global == true and local == true, so do not check an error case against it
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
}
if (local) {
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
}
if (proxy) {
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
}
if (remote) {
sb.crawlQueues.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
}
}
return null;
}
/**
 * Test a url if it can be used for crawling/indexing.
 * This mainly checks if the url is in the declared domain (local/global)
 * according to the network.unit.domain configuration.
 * @param url the url to be checked
 * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
 */
public String urlInAcceptedDomain(final yacyURL url) {
    if (url == null) return "url is null";
    final String host = url.getHost();
    if (host == null) return "url.host is null";
    // fast shortcut: if both domains are accepted nothing needs to be tested
    // (in particular, no DNS resolve is necessary)
    if (this.acceptGlobalURLs && this.acceptLocalURLs) return null;
    // decide by the url's own locality flag which domain it belongs to
    // (a DNS-based locality check was used here historically; see yacyURL.isLocal())
    final boolean local = url.isLocal();
    if (local) {
        if (this.acceptLocalURLs) return null;
        return "the host '" + host + "' is local, but local addresses are not accepted";
    }
    if (this.acceptGlobalURLs) return null;
    return "the host '" + host + "' is global, but global addresses are not accepted";
}
/** @return true if this stacker accepts URLs from the local (intranet) domain */
public boolean acceptLocalURLs() {
return this.acceptLocalURLs;
}
/** @return true if this stacker accepts URLs from the global (internet) domain */
public boolean acceptGlobalURLs() {
return this.acceptGlobalURLs;
}
}

@ -73,7 +73,7 @@ public class plasmaSearchAPI {
yacySeed seed;
int hc = 0;
prop.put("searchresult_keyhash", startHash);
final Iterator<yacySeed> e = yacyPeerSelection.getAcceptRemoteIndexSeeds(sb.webIndex.seedDB, startHash, sb.webIndex.seedDB.sizeConnected());
final Iterator<yacySeed> e = yacyPeerSelection.getAcceptRemoteIndexSeeds(sb.webIndex.seedDB, startHash, sb.webIndex.seedDB.sizeConnected(), true);
while (e.hasNext()) {
seed = e.next();
if (seed != null) {

@ -234,7 +234,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
public int totalPPM = 0;
public double totalQPM = 0d;
public TreeMap<String, String> clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used
public boolean acceptLocalURLs, acceptGlobalURLs;
public URLLicense licensedURLs;
public Timer moreMemory;
@ -548,9 +547,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
this.observer.resourceObserverJob();
// initializing the stackCrawlThread
this.crawlStacker = new CrawlStacker(this, this.plasmaPath, (int) getConfigLong("tableTypeForPreNURL", 0), (((int) getConfigLong("tableTypeForPreNURL", 0) == 0) && (getConfigLong(plasmaSwitchboardConstants.CRAWLSTACK_BUSYSLEEP, 0) <= 100)));
//this.sbStackCrawlThread = new plasmaStackCrawlThread(this,this.plasmaPath,ramPreNURL);
//this.sbStackCrawlThread.start();
this.crawlStacker = new CrawlStacker(
crawlQueues,
this.webIndex,
"local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
"global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
// initializing dht chunk generation
this.dhtTransferChunk = null;
@ -680,10 +681,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// initiate url license object
licensedURLs = new URLLicense(8);
// set URL domain acceptance
acceptGlobalURLs = "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0;
acceptLocalURLs = "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0;
/*
// in intranet and portal network set robinson mode
if (networkUnitDefinition.equals("defaults/yacy.network.webportal.unit") ||
@ -736,7 +733,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// that an automatic authorization of localhost is done, because in this case crawls from local
// addresses are blocked to prevent attack szenarios where remote pages contain links to localhost
// addresses that can steer a YaCy peer
if ((this.acceptLocalURLs) && (getConfigBool("adminAccountForLocalhost", false))) {
if ((crawlStacker.acceptLocalURLs()) && (getConfigBool("adminAccountForLocalhost", false))) {
setConfig("adminAccountForLocalhost", false);
if (getConfig(httpd.ADMIN_ACCOUNT_B64MD5, "").startsWith("0000")) {
// the password was set automatically with a random value.
@ -856,36 +853,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
}
}
/**
* Test a url if it can be used for crawling/indexing.
* This mainly checks if the url is in the declared domain (local/global).
* @param url the url to be checked
* @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
*/
public String acceptURL(final yacyURL url) {
// returns null if the url can be accepted according to network.unit.domain
if (url == null) return "url is null";
final String host = url.getHost();
if (host == null) return "url.host is null";
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
/*
InetAddress hostAddress = serverDomains.dnsResolve(host);
// if we don't know the host, we cannot load that resource anyway.
// But in case we use a proxy, it is possible that we dont have a DNS service.
final httpRemoteProxyConfig remoteProxyConfig = httpdProxyHandler.getRemoteProxyConfig();
if (hostAddress == null) {
if ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy())) return null; else return "the dns of the host '" + host + "' cannot be resolved";
}
*/
// check if this is a local address and we are allowed to index local pages:
//boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
final boolean local = url.isLocal();
//assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null;
return (local) ?
("the host '" + host + "' is local, but local addresses are not accepted") :
("the host '" + host + "' is global, but global addresses are not accepted");
}
public String urlExists(final String hash) {
// tests if hash occurrs in any database
@ -992,7 +959,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
*
* check if ip is local ip address // TODO: remove this procotol specific code here
* ========================================================================= */
final String urlRejectReason = acceptURL(entry.url());
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(entry.url());
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logFine("Rejected URL '" + entry.url() + "': " + urlRejectReason);
doIndexing = false;
@ -1298,7 +1265,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
}
// set a random password if no password is configured
if (!this.acceptLocalURLs && getConfigBool("adminAccountForLocalhost", false) && getConfig(httpd.ADMIN_ACCOUNT_B64MD5, "").length() == 0) {
if (!crawlStacker.acceptLocalURLs() && getConfigBool("adminAccountForLocalhost", false) && getConfig(httpd.ADMIN_ACCOUNT_B64MD5, "").length() == 0) {
// make a 'random' password
setConfig(httpd.ADMIN_ACCOUNT_B64MD5, "0000" + serverCodings.encodeMD5Hex(System.getProperties().toString() + System.currentTimeMillis()));
setConfig("adminAccount", "");
@ -1998,7 +1965,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
try {
// find a list of DHT-peers
if (log != null) log.logInfo("Collecting DHT target peers for first_hash = " + dhtChunk.firstContainer().getWordHash() + ", last_hash = " + dhtChunk.lastContainer().getWordHash());
final Iterator<yacySeed> seedIter = yacyPeerSelection.getAcceptRemoteIndexSeeds(webIndex.seedDB, dhtChunk.lastContainer().getWordHash(), peerCount + 9);
final Iterator<yacySeed> seedIter = yacyPeerSelection.getAcceptRemoteIndexSeeds(webIndex.seedDB, dhtChunk.lastContainer().getWordHash(), peerCount + 9, false);
// send away the indexes to all these peers
int hc1 = 0;

@ -548,7 +548,7 @@ public final class yacyClient {
continue; // block with backlist
}
final String urlRejectReason = plasmaSwitchboard.getSwitchboard().acceptURL(comp.url());
final String urlRejectReason = plasmaSwitchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(comp.url());
if (urlRejectReason != null) {
yacyCore.log.logInfo("remote search (client): rejected url '" + comp.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
continue; // reject url outside of our domain

@ -28,6 +28,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverDate;
@ -48,7 +49,7 @@ public class yacyPeerSelection {
long distance;
for (int v = 0; v < dhtVerticalTargets.length; v++) {
wordhash = yacySeed.positionToHash(dhtVerticalTargets[v]);
Iterator<yacySeed> dhtEnum = getAcceptRemoteIndexSeeds(seedDB, wordhash, redundancy);
Iterator<yacySeed> dhtEnum = getAcceptRemoteIndexSeeds(seedDB, wordhash, redundancy, false);
int c = Math.min(seedDB.sizeConnected(), redundancy);
int cc = 3; // select a maximum of 3, this is enough redundancy
while (dhtEnum.hasNext() && c > 0 && cc-- > 0) {
@ -64,23 +65,24 @@ public class yacyPeerSelection {
}
}
public static boolean verifyIfOwnWord(final yacySeedDB seedDB, final String wordhash, int redundancy) {
public static boolean verifyIfOwnWord(final yacySeedDB seedDB, String wordhash, int redundancy) {
String myHash = seedDB.mySeed().hash;
long[] dhtVerticalTargets = yacySeed.dhtPositions(wordhash, yacySeed.partitionExponent);
for (int v = 0; v < dhtVerticalTargets.length; v++) {
Iterator<yacySeed> dhtEnum = getAcceptRemoteIndexSeeds(seedDB, yacySeed.positionToHash(dhtVerticalTargets[v]), redundancy);
//long[] dhtVerticalTargets = yacySeed.dhtPositions(wordhash, yacySeed.partitionExponent);
//for (int v = 0; v < dhtVerticalTargets.length; v++) {
//wordhash = yacySeed.positionToHash(dhtVerticalTargets[0]);
Iterator<yacySeed> dhtEnum = getAcceptRemoteIndexSeeds(seedDB, wordhash, redundancy, true);
while (dhtEnum.hasNext()) {
if (dhtEnum.next().equals(myHash)) return true;
if (dhtEnum.next().hash.equals(myHash)) return true;
}
}
//}
return false;
}
public static Iterator<yacySeed> getAcceptRemoteIndexSeeds(yacySeedDB seedDB, final String starthash, int max) {
public static Iterator<yacySeed> getAcceptRemoteIndexSeeds(yacySeedDB seedDB, final String starthash, int max, boolean alsoMyOwn) {
// returns an enumeration of yacySeed-Objects
// that have the AcceptRemoteIndex-Flag set
// the seeds are enumerated in the right order according DHT
return new acceptRemoteIndexSeedEnum(seedDB, starthash, Math.max(max, seedDB.sizeConnected()));
return new acceptRemoteIndexSeedEnum(seedDB, starthash, Math.min(max, seedDB.sizeConnected()), alsoMyOwn);
}
private static class acceptRemoteIndexSeedEnum implements Iterator<yacySeed> {
@ -90,13 +92,15 @@ public class yacyPeerSelection {
private yacySeedDB seedDB;
private HashSet<String> doublecheck;
private int remaining;
private boolean alsoMyOwn;
public acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final String starthash, int max) {
public acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final String starthash, int max, boolean alsoMyOwn) {
this.seedDB = seedDB;
this.se = getDHTSeeds(seedDB, starthash, yacyVersion.YACY_HANDLES_COLLECTION_INDEX);
this.remaining = max;
this.doublecheck = new HashSet<String>();
this.nextSeed = nextInternal();
this.alsoMyOwn = alsoMyOwn && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
}
public boolean hasNext() {
@ -127,9 +131,15 @@ public class yacyPeerSelection {
}
public yacySeed next() {
final yacySeed next = nextSeed;
nextSeed = nextInternal();
return next;
if (alsoMyOwn && kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) < 0) {
// take my own seed hash instead the enumeration result
alsoMyOwn = false;
return seedDB.mySeed();
} else {
final yacySeed next = nextSeed;
nextSeed = nextInternal();
return next;
}
}
public void remove() {

@ -1017,19 +1017,45 @@ public class yacySeed implements Cloneable {
private static int guessedOwn = 0;
//private static int guessedNotOwn = 0;
private static int verifiedOwn = 0;
private static int verifiedNotOwn = 0;
public static boolean shallBeOwnWord(final yacySeedDB seedDB, final String wordhash, int redundancy) {
if (!guessIfOwnWord(seedDB, wordhash)) return false;
guessedOwn++;
if (yacyPeerSelection.verifyIfOwnWord(seedDB, wordhash, redundancy)) {
verifiedOwn++;
System.out.println("*** DEBUG shallBeOwnWord: true. verified/guessed ration = " + verifiedOwn + "/" + guessedOwn);
return true;
// the guessIfOwnWord is a fast method that should only fail in case that a 'true' may be incorrect, but a 'false' shall always be correct
if (guessIfOwnWord(seedDB, wordhash)) {
// this case must be verified, because it can be wrong.
guessedOwn++;
if (yacyPeerSelection.verifyIfOwnWord(seedDB, wordhash, redundancy)) {
// this is the correct case, but does not need to be an average case
verifiedOwn++;
//System.out.println("*** DEBUG shallBeOwnWord: true. guessed: true. verified/guessed ration = " + verifiedOwn + "/" + guessedOwn);
return true;
} else {
// this may happen, but can be corrected
verifiedNotOwn++;
//System.out.println("*** DEBUG shallBeOwnWord: false. guessed: true. verified/guessed ration = " + verifiedNotOwn + "/" + guessedNotOwn);
return false;
}
} else {
System.out.println("*** DEBUG shallBeOwnWord: false. verified/guessed ration = " + verifiedOwn + "/" + guessedOwn);
return false;
/*
// this should mean that the guessing should not be wrong
guessedNotOwn++;
if (yacyPeerSelection.verifyIfOwnWord(seedDB, wordhash, redundancy)) {
// this should never happen
verifiedOwn++;
System.out.println("*** DEBUG shallBeOwnWord: true. guessed: false. verified/guessed ration = " + verifiedOwn + "/" + guessedOwn);
return true;
} else {
// this should always happen
verifiedNotOwn++;
//System.out.println("*** DEBUG shallBeOwnWord: false. guessed: false. verified/guessed ration = " + verifiedNotOwn + "/" + guessedNotOwn);
return false;
}
*/
}
}
private static boolean guessIfOwnWord(final yacySeedDB seedDB, final String wordhash) {

Loading…
Cancel
Save