Michael Peter Christen 13 years ago
parent edaa8ac94c
commit c6c61be3f0

@ -253,7 +253,7 @@ public class CrawlResults {
if (showIP && urle != null) {
prop.put("table_indexed_" + cnt + "_showIP", "1");
prop.put("table_indexed_" + cnt + "_showIP_ip", urle.url().getInetAddress().getHostAddress());
prop.put("table_indexed_" + cnt + "_showIP_ip", urle.url().getHost() == null ? "" : urle.url().getInetAddress().getHostAddress());
} else
prop.put("table_indexed_" + cnt + "_showIP", "0");

@ -186,6 +186,7 @@ public final class CrawlStacker {
}
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
new Thread() {
@Override
public void run() {
enqueueEntries(initiator, profileHandle, hyperlinks, true);
}
@ -240,6 +241,7 @@ public final class CrawlStacker {
public void enqueueEntriesFTP(final byte[] initiator, final String profileHandle, final String host, final int port, final boolean replace) {
final CrawlQueues cq = this.nextQueue;
new Thread() {
@Override
public void run() {
BlockingQueue<FTPClient.entryInfo> queue;
try {
@ -487,13 +489,13 @@ public final class CrawlStacker {
// this is expensive, so these filters are checked at the end, after all other tests
// filter with must-match for IPs
if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && !profile.ipMustMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && url.getHost() != null && !profile.ipMustMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'.");
return "ip " + url.getInetAddress().getHostAddress() + " of url does not match must-match filter";
}
// filter with must-not-match for IPs
if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && url.getHost() != null && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'.");
return "ip " + url.getInetAddress().getHostAddress() + " of url matches must-not-match filter";
}

@ -702,6 +702,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
/**
 * Returns the IP address of this URL's host, resolving it on first use.
 * <p>
 * A non-null result is stored in {@code hostAddress} and returned directly by
 * later calls. A null result is not remembered, so the next call attempts the
 * resolution again.
 *
 * @return the cached or freshly resolved address; {@code null} if this URL has
 *         no host (this may happen for file:// urls), or if
 *         {@code Domains.dnsResolve} yields {@code null}
 *         (NOTE(review): presumably on lookup failure - confirm against Domains)
 */
public InetAddress getInetAddress() {
    // a previous call already resolved the host
    if (this.hostAddress != null) return this.hostAddress;
    if (this.host == null) return null; // this may happen for file:// urls
    // Lower-case with a fixed locale: the default-locale variant maps 'I' to a
    // dotless 'ı' under e.g. a Turkish locale, which corrupts the host name.
    this.hostAddress = Domains.dnsResolve(this.host.toLowerCase(java.util.Locale.ROOT));
    return this.hostAddress;
}
@ -1050,7 +1051,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
} else if (host_tld.equals("cr")) {//Costa Rica /2,060,000
language = "es";//spanish; spa
} else if (host_tld.equals("cy")) {//Cyprus /2,500,000
language = "el";//greek; gre (ell); ell
language = "el";//greek; gre (ell); ell
} else if (host_tld.equals("cu")) {//Cuba /2,040,000
language = "es";//spanish; spa
} else if (host_tld.equals("cx")) {//Christmas Island /1,830,000
@ -1323,7 +1324,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
} else if (host_tld.equals("mg")) {//Madagascar /255,000
language = "mg";//malagasy; mlg (mlg); mlg (macrolanguage): plt
//language = "fr";//french; fre (fra); fra
//malagasy is native language, but elite want to french
//malagasy is native language, but elite want to french
} else if (host_tld.equals("mr")) {//Mauritania /210,000
language = "ar";//arabic; ara; mey
//language = "fr";//french; fre (fra); fra

Loading…
Cancel
Save