*) next step of restructuring for new crawlers

- adding function isSupportedProtocol to plasmaCrawlLoader.java
   - disabling robots.txt check for protocols other than http(s)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2479 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 9ded4e8d5a
commit e3f0136606

@ -70,6 +70,7 @@ public class plasmaCrawlEURL extends indexURL {
// invalid urls
// Reason strings for URLs that are refused; the crawl stacker stores one of
// these in its "reason" variable and returns it to the caller.
// set when the URL string handed to the stacker is null
public static final String DENIED_URL_NULL = "denied_(url_null)";
// NOTE(review): presumably used when parsing the URL string fails - confirm at the usage site
public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
// set when the crawl loader reports the URL's protocol as not supported
public static final String DENIED_UNSUPPORTED_PROTOCOL = "denied_(unsupported_protocol)";
// NOTE(review): the three reasons below are not used in the visible hunks; meaning inferred from the names only
public static final String DENIED_PRIVATE_IP_ADDRESS = "denied_(private_ip_address)";
public static final String DENIED_LOOPBACK_IP_ADDRESS = "denied_(loopback_ip_address)";
public static final String DENIED_CACHEFILE_PATH_TOO_LONG = "denied_(cachefile_path_too_long)";

@ -122,6 +122,15 @@ public final class plasmaCrawlLoader extends Thread {
this.crawlwerPool.setConfig(newConfig);
}
/**
 * Tells whether the given protocol name is one this loader accepts.
 * The name is trimmed and lower-cased before it is compared; at the
 * moment only "http" and "https" are accepted.
 *
 * @param protocol the protocol name to check, may be <code>null</code>
 * @return <code>true</code> for "http" or "https" (ignoring case and
 *         surrounding whitespace), <code>false</code> for <code>null</code>,
 *         the empty string and every other value
 */
public boolean isSupportedProtocol(String protocol) {
    // without a protocol name there is nothing to accept
    if (protocol == null || protocol.length() == 0) {
        return false;
    }

    // TODO: read the supported protocols out from a config file
    final String normalized = protocol.trim().toLowerCase();
    if ("http".equals(normalized)) {
        return true;
    }
    return "https".equals(normalized);
}
public void close() {
try {
// setting the stop flag to true

@ -252,6 +252,9 @@ public final class plasmaCrawlStacker {
long startTime = System.currentTimeMillis();
String reason = null; // failure reason
// getting the initiator peer hash
if ((initiatorHash == null) || (initiatorHash.length() == 0)) initiatorHash = indexURL.dummyHash;
// strange errors
if (nexturlString == null) {
reason = plasmaCrawlEURL.DENIED_URL_NULL;
@ -259,11 +262,8 @@ public final class plasmaCrawlStacker {
return reason;
}
// getting the initiator peer hash
if ((initiatorHash == null) || (initiatorHash.length() == 0)) initiatorHash = indexURL.dummyHash;
// getting the referer url and url hash
URL nexturl = null, referrerURL = null;
URL referrerURL = null;
if (referrerString != null) {
try {
referrerURL = new URL(referrerString);
@ -275,6 +275,7 @@ public final class plasmaCrawlStacker {
String referrerHash = (referrerString==null)?null:indexURL.urlHash(referrerString);
// check for malformed urls
URL nexturl = null;
try {
nexturl = new URL(nexturlString);
} catch (MalformedURLException e) {
@ -284,6 +285,15 @@ public final class plasmaCrawlStacker {
return reason;
}
// check if the protocol is supported
String urlProtocol = nexturl.getProtocol();
if (!this.sb.cacheLoader.isSupportedProtocol(urlProtocol)) {
reason = plasmaCrawlEURL.DENIED_UNSUPPORTED_PROTOCOL;
this.log.logSevere("Unsupported protocol in URL '" + nexturlString + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
// check if ip is local ip address
checkInterruption();
InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost());
@ -382,9 +392,9 @@ public final class plasmaCrawlStacker {
return reason;
}
// checking robots.txt
// checking robots.txt for http(s) resources
checkInterruption();
if (robotsParser.isDisallowed(nexturl)) {
if ((urlProtocol.equals("http") || urlProtocol.equals("https")) && robotsParser.isDisallowed(nexturl)) {
reason = plasmaCrawlEURL.DENIED_ROBOTS_TXT;
this.log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt. " +

Loading…
Cancel
Save