@@ -1195,8 +1195,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // filter deny
         if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) {
             reason = "denied_(does_not_match_filter)";
-            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
-            name, reason, new bitfield(plasmaURL.urlFlagLength), false);
+            /*
+            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+            name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
             log.logFine("URL '" + nexturlString + "' does not match crawling filter '" + profile.generalFilter() + "'.");
             return reason;
         }
@@ -1204,8 +1205,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // deny cgi
         if (plasmaHTCache.isCGI(nexturlString)) {
             reason = "denied_(cgi_url)";
-            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
-            name, reason, new bitfield(plasmaURL.urlFlagLength), false);
+            /*
+            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+            name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
             log.logFine("URL '" + nexturlString + "' is cgi URL.");
             return reason;
         }
@@ -1213,33 +1215,35 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // deny post properties
         if ((plasmaHTCache.isPOST(nexturlString)) && (profile != null) && (!(profile.crawlingQ()))) {
             reason = "denied_(post_url)";
-            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
-            name, reason, new bitfield(plasmaURL.urlFlagLength), false);
+            /*
+            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+            name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
             log.logFine("URL '" + nexturlString + "' is post URL.");
             return reason;
         }
-
-        // checking robots.txt
-        if (robotsParser.isDisallowed(nexturl)) {
-            reason = "denied_(robots.txt)";
-            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
-            name, reason, new bitfield(plasmaURL.urlFlagLength), false);
-            log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt.");
-            return reason;
-        }

         String nexturlhash = plasmaURL.urlHash(nexturl);
         String dbocc = "";
         if ((dbocc = urlPool.exists(nexturlhash)) != null) {
             // DISTIGUISH OLD/RE-SEARCH CASES HERE!
             reason = "double_(registered_in_" + dbocc + ")";
-            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
-            name, reason, new bitfield(plasmaURL.urlFlagLength), false);
+            /*
+            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+            name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
             log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'.");
             return reason;
         }
+
+        // checking robots.txt
+        if (robotsParser.isDisallowed(nexturl)) {
+            reason = "denied_(robots.txt)";
+            /*
+            urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
+            name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
+            log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt.");
+            return reason;
+        }

         // store information
         boolean local = ((initiatorHash.equals(plasmaURL.dummyHash)) || (initiatorHash.equals(yacyCore.seedDB.mySeed.hash)));
         boolean global = 