From c6d2f50375f4ac29888ee57cf6a3571c131340ff Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 23 Sep 2005 00:18:08 +0000 Subject: [PATCH] changed order of robots and double-check git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@783 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../de/anomic/plasma/plasmaSwitchboard.java | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 023360a28..b84065f4e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1195,8 +1195,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // filter deny if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) { reason = "denied_(does_not_match_filter)"; + /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, - name, reason, new bitfield(plasmaURL.urlFlagLength), false); + name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ log.logFine("URL '" + nexturlString + "' does not match crawling filter '" + profile.generalFilter() + "'."); return reason; } @@ -1204,8 +1205,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // deny cgi if (plasmaHTCache.isCGI(nexturlString)) { reason = "denied_(cgi_url)"; + /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, - name, reason, new bitfield(plasmaURL.urlFlagLength), false); + name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ log.logFine("URL '" + nexturlString + "' is cgi URL."); return reason; } @@ -1213,33 +1215,35 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // deny post properties if ((plasmaHTCache.isPOST(nexturlString)) && (profile != null) && (!(profile.crawlingQ()))) { reason = "denied_(post_url)"; + /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, - name, reason, new bitfield(plasmaURL.urlFlagLength), false); + name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ log.logFine("URL '" + nexturlString + "' is post URL."); return reason; } - // checking robots.txt - if (robotsParser.isDisallowed(nexturl)) { - reason = "denied_(robots.txt)"; - urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, - name, reason, new bitfield(plasmaURL.urlFlagLength), false); - log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt."); - return reason; - } - - String nexturlhash = plasmaURL.urlHash(nexturl); String dbocc = ""; if ((dbocc = urlPool.exists(nexturlhash)) != null) { // DISTIGUISH OLD/RE-SEARCH CASES HERE! reason = "double_(registered_in_" + dbocc + ")"; + /* urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, - name, reason, new bitfield(plasmaURL.urlFlagLength), false); + name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'."); return reason; } + // checking robots.txt + if (robotsParser.isDisallowed(nexturl)) { + reason = "denied_(robots.txt)"; + /* + urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, + name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ + log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt."); + return reason; + } + // store information boolean local = ((initiatorHash.equals(plasmaURL.dummyHash)) || (initiatorHash.equals(yacyCore.seedDB.mySeed.hash))); boolean global =