Michael Peter Christen 13 years ago
parent edaa8ac94c
commit c6c61be3f0

@ -253,7 +253,7 @@ public class CrawlResults {
if (showIP && urle != null) { if (showIP && urle != null) {
prop.put("table_indexed_" + cnt + "_showIP", "1"); prop.put("table_indexed_" + cnt + "_showIP", "1");
prop.put("table_indexed_" + cnt + "_showIP_ip", urle.url().getInetAddress().getHostAddress()); prop.put("table_indexed_" + cnt + "_showIP_ip", urle.url().getHost() == null ? "" : urle.url().getInetAddress().getHostAddress());
} else } else
prop.put("table_indexed_" + cnt + "_showIP", "0"); prop.put("table_indexed_" + cnt + "_showIP", "0");

@ -186,6 +186,7 @@ public final class CrawlStacker {
} }
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) { public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
new Thread() { new Thread() {
@Override
public void run() { public void run() {
enqueueEntries(initiator, profileHandle, hyperlinks, true); enqueueEntries(initiator, profileHandle, hyperlinks, true);
} }
@ -240,6 +241,7 @@ public final class CrawlStacker {
public void enqueueEntriesFTP(final byte[] initiator, final String profileHandle, final String host, final int port, final boolean replace) { public void enqueueEntriesFTP(final byte[] initiator, final String profileHandle, final String host, final int port, final boolean replace) {
final CrawlQueues cq = this.nextQueue; final CrawlQueues cq = this.nextQueue;
new Thread() { new Thread() {
@Override
public void run() { public void run() {
BlockingQueue<FTPClient.entryInfo> queue; BlockingQueue<FTPClient.entryInfo> queue;
try { try {
@ -487,13 +489,13 @@ public final class CrawlStacker {
// this is expensive and those filters are checked at the end of all other tests // this is expensive and those filters are checked at the end of all other tests
// filter with must-match for IPs // filter with must-match for IPs
if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && !profile.ipMustMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) { if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && url.getHost() != null && !profile.ipMustMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'.");
return "ip " + url.getInetAddress().getHostAddress() + " of url does not match must-match filter"; return "ip " + url.getInetAddress().getHostAddress() + " of url does not match must-match filter";
} }
// filter with must-not-match for IPs // filter with must-not-match for IPs
if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) { if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && url.getHost() != null && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'.");
return "ip " + url.getInetAddress().getHostAddress() + " of url matches must-not-match filter"; return "ip " + url.getInetAddress().getHostAddress() + " of url matches must-not-match filter";
} }

@ -702,6 +702,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
public InetAddress getInetAddress() { public InetAddress getInetAddress() {
if (this.hostAddress != null) return this.hostAddress; if (this.hostAddress != null) return this.hostAddress;
if (this.host == null) return null; // this may happen for file:// urls
this.hostAddress = Domains.dnsResolve(this.host.toLowerCase()); this.hostAddress = Domains.dnsResolve(this.host.toLowerCase());
return this.hostAddress; return this.hostAddress;
} }

Loading…
Cancel
Save