Changed the order of the robots.txt check and the double-registration check

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@783 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 68d5ff2ef1
commit c6d2f50375

@ -1195,8 +1195,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// filter deny
if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) {
reason = "denied_(does_not_match_filter)";
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
log.logFine("URL '" + nexturlString + "' does not match crawling filter '" + profile.generalFilter() + "'.");
return reason;
}
@ -1204,8 +1205,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// deny cgi
if (plasmaHTCache.isCGI(nexturlString)) {
reason = "denied_(cgi_url)";
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
log.logFine("URL '" + nexturlString + "' is cgi URL.");
return reason;
}
@ -1213,33 +1215,35 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// deny post properties
if ((plasmaHTCache.isPOST(nexturlString)) && (profile != null) && (!(profile.crawlingQ()))) {
reason = "denied_(post_url)";
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
log.logFine("URL '" + nexturlString + "' is post URL.");
return reason;
}
// checking robots.txt
if (robotsParser.isDisallowed(nexturl)) {
reason = "denied_(robots.txt)";
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt.");
return reason;
}
String nexturlhash = plasmaURL.urlHash(nexturl);
String dbocc = "";
if ((dbocc = urlPool.exists(nexturlhash)) != null) {
// DISTIGUISH OLD/RE-SEARCH CASES HERE!
reason = "double_(registered_in_" + dbocc + ")";
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'.");
return reason;
}
// checking robots.txt
if (robotsParser.isDisallowed(nexturl)) {
reason = "denied_(robots.txt)";
/*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt.");
return reason;
}
// store information
boolean local = ((initiatorHash.equals(plasmaURL.dummyHash)) || (initiatorHash.equals(yacyCore.seedDB.mySeed.hash)));
boolean global =

Loading…
Cancel
Save