diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index dc0be6f31..497e10531 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -44,7 +44,6 @@ import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ftp.FTPClient;
-import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
@@ -52,7 +51,6 @@ import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.workflow.WorkflowProcessor;
 import net.yacy.repository.Blacklist;
 import net.yacy.repository.FilterEngine;
-
 import de.anomic.crawler.ResultURLs.EventOrigin;
 import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.crawler.retrieval.FTPLoader;
@@ -77,34 +75,34 @@ public final class CrawlStacker {
 
     private final FilterEngine domainList;
 
     public final static class DomProfile {
-
+
         public String referrer;
         public int depth, count;
-
+
         public DomProfile(final String ref, final int d) {
             this.referrer = ref;
             this.depth = d;
             this.count = 1;
         }
-
+
         public void inc() {
             this.count++;
         }
-
+
     }
-
-    private Map<String, DomProfile> doms;
-
+
+    private final Map<String, DomProfile> doms;
+
     // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
     public CrawlStacker(
-            CrawlQueues cq,
-            CrawlSwitchboard cs,
-            Segment indexSegment,
-            yacySeedDB peers,
-            boolean acceptLocalURLs,
-            boolean acceptGlobalURLs,
-            FilterEngine domainList) {
+            final CrawlQueues cq,
+            final CrawlSwitchboard cs,
+            final Segment indexSegment,
+            final yacySeedDB peers,
+            final boolean acceptLocalURLs,
+            final boolean acceptGlobalURLs,
+            final FilterEngine domainList) {
         this.nextQueue = cq;
         this.crawler = cs;
         this.indexSegment = indexSegment;
@@ -122,17 +120,17 @@ public final class CrawlStacker {
     }
     private void domInc(final String domain, final String referrer, final int depth) {
-        final DomProfile dp = doms.get(domain);
+        final DomProfile dp = this.doms.get(domain);
         if (dp == null) {
             // new domain
-            doms.put(domain, new DomProfile(referrer, depth));
+            this.doms.put(domain, new DomProfile(referrer, depth));
         } else {
             // increase counter
             dp.inc();
         }
     }
 
     public String domName(final boolean attr, final int index){
-        final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();
+        final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
         String domname="";
         Map.Entry<String, DomProfile> ey;
         DomProfile dp;
@@ -148,7 +146,7 @@ public final class CrawlStacker {
         }
         return domname;
     }
-
+
     public int size() {
         return this.fastQueue.queueSize() + this.slowQueue.queueSize();
     }
@@ -194,8 +192,8 @@ public final class CrawlStacker {
         // we just don't know anything about that host
         return false;
     }
-
-    public Request job(Request entry) {
+
+    public Request job(final Request entry) {
         // this is the method that is called by the busy thread from outside
         if (entry == null) return null;
 
@@ -204,7 +202,7 @@ public final class CrawlStacker {
             // if the url was rejected we store it into the error URL db
             if (rejectReason != null) {
-                nextQueue.errorURL.push(entry, ASCII.getBytes(peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
+                this.nextQueue.errorURL.push(entry, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
             }
         } catch (final Exception e) {
             CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n"
                     + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@@ -216,25 +214,25 @@ public final class CrawlStacker {
 
     public void enqueueEntry(final Request entry) {
         // DEBUG
-        if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
+        if (this.log.isFinest()) this.log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
         if (prefetchHost(entry.url().getHost())) {
             try {
                 this.fastQueue.enQueue(entry);
                 //this.dnsHit++;
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
         } else {
             try {
                 this.slowQueue.enQueue(entry);
                 this.dnsMiss++;
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
         }
     }
 
-    public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
+    public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
         new Thread() {
             public void run() {
                 enqueueEntries(initiator, profileHandle, hyperlinks, true);
@@ -242,15 +240,15 @@ public final class CrawlStacker {
         }.start();
     }
 
-    private void enqueueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
-        for (Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
+    private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
+        for (final Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
             if (e.getKey() == null) continue;
-
+
             // delete old entry, if exists to force a re-load of the url (thats wanted here)
             final DigestURI url = new DigestURI(e.getKey());
             final byte[] urlhash = url.hash();
             if (replace) {
-                indexSegment.urlMetadata().remove(urlhash);
+                this.indexSegment.urlMetadata().remove(urlhash);
                 this.nextQueue.urlRemove(urlhash);
                 String u = url.toNormalform(true, true);
                 if (u.endsWith("/")) {
@@ -259,23 +257,23 @@ public final class CrawlStacker {
                     u = u + "/index.html";
                 }
                 try {
-                    byte[] uh = new DigestURI(u, null).hash();
-                    indexSegment.urlMetadata().remove(uh);
+                    final byte[] uh = new DigestURI(u, null).hash();
+                    this.indexSegment.urlMetadata().remove(uh);
                     this.nextQueue.noticeURL.removeByURLHash(uh);
                     this.nextQueue.errorURL.remove(uh);
-                } catch (MalformedURLException e1) {}
+                } catch (final MalformedURLException e1) {}
             }
-
+
             if (url.getProtocol().equals("ftp")) {
                 // put the whole ftp site on the crawl stack
                 enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), replace);
             } else {
                 // put entry on crawl stack
                 enqueueEntry(new Request(
-                        initiator,
-                        url,
-                        null,
-                        e.getValue().getProperty("name", ""),
+                        initiator,
+                        url,
+                        null,
+                        e.getValue().getProperty("name", ""),
                         new Date(),
                         profileHandle,
                         0,
@@ -286,7 +284,7 @@ public final class CrawlStacker {
             }
         }
     }
-
+
     public void enqueueEntriesFTP(final byte[] initiator, final String profileHandle, final String host, final int port, final boolean replace) {
         final CrawlQueues cq = this.nextQueue;
         new Thread() {
@@ -296,27 +294,27 @@ public final class CrawlStacker {
                     queue = FTPClient.sitelist(host, port);
                     FTPClient.entryInfo entry;
                     while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {
-
+
                         // delete old entry, if exists to force a re-load of the url (thats wanted here)
                         DigestURI url = null;
                         try {
                             url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
-                        } catch (MalformedURLException e) {
+                        } catch (final MalformedURLException e) {
                             continue;
                         }
                         final byte[] urlhash = url.hash();
                         if (replace) {
-                            indexSegment.urlMetadata().remove(urlhash);
+                            CrawlStacker.this.indexSegment.urlMetadata().remove(urlhash);
                             cq.noticeURL.removeByURLHash(urlhash);
                             cq.errorURL.remove(urlhash);
                         }
-
+
                         // put entry on crawl stack
                         enqueueEntry(new Request(
-                                initiator,
-                                url,
-                                null,
-                                MultiProtocolURI.unescape(entry.name),
+                                initiator,
+                                url,
+                                null,
+                                MultiProtocolURI.unescape(entry.name),
                                 entry.date,
                                 profileHandle,
                                 0,
@@ -325,22 +323,22 @@ public final class CrawlStacker {
                                 entry.size
                         ));
                     }
-                } catch (IOException e1) {
-                } catch (InterruptedException e) {
+                } catch (final IOException e1) {
+                } catch (final InterruptedException e) {
                 }
             }
         }.start();
     }
-
+
     /**
      * simple method to add one url as crawljob
     * @param url
     * @return null if successfull, a reason string if not successful
     */
     public String stackSimpleCrawl(final DigestURI url) {
-        CrawlProfile pe = this.crawler.defaultSurrogateProfile;
+        final CrawlProfile pe = this.crawler.defaultSurrogateProfile;
         return stackCrawl(new Request(
-                peers.mySeed().hash.getBytes(),
+                this.peers.mySeed().hash.getBytes(),
                 url,
                 null,
                 "CRAWLING-ROOT",
@@ -352,7 +350,7 @@ public final class CrawlStacker {
                 0
         ));
     }
-
+
     /**
      * stacks a crawl item. The position can also be remote
      * @param entry
@@ -360,29 +358,29 @@ public final class CrawlStacker {
      */
     public String stackCrawl(final Request entry) {
         //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-
-        final CrawlProfile profile = crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+
+        final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
         String error;
         if (profile == null) {
             error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
-            log.logWarning(error);
+            this.log.logWarning(error);
             return error;
         }
-
+
         error = checkAcceptance(entry.url(), profile, entry.depth());
         if (error != null) return error;
-
+
         // store information
-        final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), UTF8.getBytes(peers.mySeed().hash));
-        final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || ASCII.String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle());
-        final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle());
+        final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), UTF8.getBytes(this.peers.mySeed().hash));
+        final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || ASCII.String(entry.initiator()).equals("------------")) && profile.handle().equals(this.crawler.defaultProxyProfile.handle());
+        final boolean remote = profile.handle().equals(this.crawler.defaultRemoteProfile.handle());
         final boolean global =
             (profile.remoteIndexing()) /* granted */ &&
             (entry.depth() == profile.depth()) /* leaf node */ &&
             //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
             (
-                (peers.mySeed().isSenior()) ||
-                (peers.mySeed().isPrincipal())
+                (this.peers.mySeed().isSenior()) ||
+                (this.peers.mySeed().isPrincipal())
             ) /* qualified */;
 
         if (!local && !global && !remote && !proxy) {
@@ -390,10 +388,10 @@ public final class CrawlStacker {
             this.log.logSevere(error);
             return error;
         }
-
+
         long maxFileSize = Long.MAX_VALUE;
         if (entry.size() > 0) {
-            String protocol = entry.url().getProtocol();
+            final String protocol = entry.url().getProtocol();
             if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
             if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
             if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
@@ -401,15 +399,15 @@ public final class CrawlStacker {
 
         // check availability of parser and maxfilesize
         String warning = null;
-        if (entry.size() > maxFileSize ||
+        if (entry.size() > maxFileSize /*||
             (entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
-           ) {
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
+           */) {
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
             if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
             return null;
         }
-
-        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
+
+        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : this.nextQueue.getURL(entry.referrerhash());
 
         // add domain to profile domain list
         if (profile.domMaxPages() != Integer.MAX_VALUE) {
@@ -420,24 +418,24 @@ public final class CrawlStacker {
             // it may be possible that global == true and local == true, so do not check an error case against it
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
         } else if (local) {
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
         } else if (proxy) {
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
         } else if (remote) {
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
         }
 
         if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
         return null;
     }
 
-    public String checkAcceptance(final DigestURI url, final CrawlProfile profile, int depth) {
-
+    public String checkAcceptance(final DigestURI url, final CrawlProfile profile, final int depth) {
+
         // check if the protocol is supported
         final String urlProtocol = url.getProtocol();
         if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
@@ -483,14 +481,14 @@ public final class CrawlStacker {
         }
 
         // check if the url is double registered
-        final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
-        URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash());
+        final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
+        final URIMetadataRow oldEntry = this.indexSegment.urlMetadata().load(url.hash());
         if (oldEntry == null) {
             if (dbocc != null) {
                 // do double-check
                 if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
                 if (dbocc.equals("errors")) {
-                    ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
+                    final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
                     return "double in: errors (" + errorEntry.anycause() + ")";
                 } else {
                     return "double in: " + dbocc;
@@ -499,7 +497,7 @@ public final class CrawlStacker {
         } else {
             final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
             if (recrawl) {
-                if (this.log.isInfo()) 
+                if (this.log.isInfo())
                     this.log.logInfo("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
                         ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
             } else {
@@ -508,7 +506,7 @@ public final class CrawlStacker {
         } else {
             if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
             if (dbocc.equals("errors")) {
-                ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
+                final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
                 return "double in: errors (" + errorEntry.anycause() + ")";
             } else {
                 return "double in: " + dbocc;
@@ -520,22 +518,22 @@ public final class CrawlStacker {
         // deny urls that exceed allowed number of occurrences
         final int maxAllowedPagesPerDomain = profile.domMaxPages();
         if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) {
-            final DomProfile dp = doms.get(url.getHost());
+            final DomProfile dp = this.doms.get(url.getHost());
             if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                 return "crawl stack domain counter exceeded";
            }
-
+
            if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= profile.domMaxPages()) {
                 if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                 return "result stack domain counter exceeded";
            }
        }
-
+
        return null;
    }
-
-
+
+
     /**
      * Test a url if it can be used for crawling/indexing
      * This mainly checks if the url is in the declared domain (local/global)
@@ -559,12 +557,12 @@ public final class CrawlStacker {
         // check if this is a local address and we are allowed to index local pages:
         //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
         //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
-        InetAddress ia = Domains.dnsResolve(host);
+        final InetAddress ia = Domains.dnsResolve(host);
         return (local) ?
             ("the host '" + host + "' is local, but local addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress())) :
             ("the host '" + host + "' is global, but global addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress()));
     }
-
+
     public String urlInAcceptedDomainHash(final byte[] urlhash) {
         // returns true if the url can be accepted according to network.unit.domain
         if (urlhash == null) return "url is null";
diff --git a/source/net/yacy/cora/services/federated/solr/SolrChardingConnector.java b/source/net/yacy/cora/services/federated/solr/SolrChardingConnector.java
index 8127bcb71..3f39ac2a7 100644
--- a/source/net/yacy/cora/services/federated/solr/SolrChardingConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrChardingConnector.java
@@ -25,10 +25,12 @@ package net.yacy.cora.services.federated.solr;
 
 import java.io.IOException;
+import java.net.InetAddress;
 
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.document.Document;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -183,7 +185,11 @@ public class SolrChardingConnector {
     public String[] getAdminInterfaceList() {
         final String[] urlAdmin = new String[this.connectors.size()];
         int i = 0;
-        for (final String u: this.urls) {
+        final InetAddress localhostExternAddress = Domains.myPublicLocalIP();
+        final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress();
+        for (String u: this.urls) {
+            int p = u.indexOf("localhost"); if (p < 0) p = u.indexOf("127.0.0.1");
+            if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9);
             urlAdmin[i++] = u + (u.endsWith("/") ? "admin/" : "/admin/");
         }
         return urlAdmin;