From 458c20ff729bce4bb34bac2b93491b43baf91c65 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 16 Nov 2011 13:06:46 +0000 Subject: [PATCH 1/4] fix for robot parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8044 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/RobotsTxt.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 91e7444f9..94eaaf391 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -196,12 +196,17 @@ public class RobotsTxt { } } else { final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT]; - Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + UTF8.String(robotsTxt)); // debug TODO remove - final RobotsTxtParser parserResult = new RobotsTxtParser(robotsTxt, thisAgents); - ArrayList denyPath = parserResult.denyList(); + Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove + RobotsTxtParser parserResult; + ArrayList denyPath; if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) { + parserResult = new RobotsTxtParser(thisAgents); + // create virtual deny path denyPath = new ArrayList(); denyPath.add("/"); + } else { + parserResult = new RobotsTxtParser(thisAgents, robotsTxt); + denyPath = parserResult.denyList(); } // store the data into the robots DB From 7a5841e061021be2cab33ed7855d8a0751091c11 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 16 Nov 2011 13:12:46 +0000 Subject: [PATCH 2/4] fix for robot parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8045 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/RobotsTxtParser.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/de/anomic/crawler/RobotsTxtParser.java b/source/de/anomic/crawler/RobotsTxtParser.java index fa0a16964..986edfd3b 100644 --- a/source/de/anomic/crawler/RobotsTxtParser.java +++ b/source/de/anomic/crawler/RobotsTxtParser.java @@ -78,13 +78,17 @@ public final class RobotsTxtParser { private final Set myNames; // a list of own name lists private String agentName; // the name of the agent that was used to return the result - protected RobotsTxtParser(final byte[] robotsTxt, final Set myNames) { + protected RobotsTxtParser(final Set myNames) { this.allowList = new ArrayList(0); this.denyList = new ArrayList(0); this.sitemap = ""; this.crawlDelayMillis = 0; this.myNames = myNames; this.agentName = null; + } + + protected RobotsTxtParser(final Set myNames, final byte[] robotsTxt) { + this(myNames); if (robotsTxt != null && robotsTxt.length != 0) { final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt); final BufferedReader reader = new BufferedReader(new InputStreamReader(bin)); From a99934226e5f70b330ac99ee575a5a7746a27f51 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 16 Nov 2011 13:56:31 +0000 Subject: [PATCH 3/4] more logging for debugging of robots.txt git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8046 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/RobotsTxt.java | 27 ++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 94eaaf391..c689eb79e 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -378,7 +378,7 @@ public class RobotsTxt { } } else if (code == 401 || code == 403) { accessCompletelyRestricted = true; - if (log.isDebugEnabled()) log.debug("Access to Robots.txt not allowed on URL '" + robotsURL + "'."); + log.info("Access to Robots.txt not allowed on URL '" + robotsURL + "'., redirectionCount = " + redirectionCount); // since this is a strange case we log it all the time } else { if (log.isDebugEnabled()) log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "]."); @@ -389,4 +389,29 @@ public class RobotsTxt { } return new Object[]{Boolean.valueOf(accessCompletelyRestricted),robotsTxt,eTag,lastMod}; } + + public final static void main(final String[] args) throws Exception { + + final String url = "http://www.badelatschen.net/robots.txt"; + final Object[] o = downloadRobotsTxt(new MultiProtocolURI(url), 0, null); + if (o == null) { + System.out.println("result: null"); + } else { + System.out.println("not allowed = " + ((Boolean) o[0]).toString()); + System.out.println("robots = " + ((o[1] == null) ? "null" : UTF8.String((byte[]) o[1]))); + } + System.exit(0); +/* + final HttpClient httpclient = new DefaultHttpClient(); + try { + final HttpGet httpget = new HttpGet(url); + final ResponseHandler responseHandler = new BasicResponseHandler(); + final String responseBody = httpclient.execute(httpget, responseHandler); + System.out.println(responseBody); + } finally { + httpclient.getConnectionManager().shutdown(); + } + */ + } + } From 06352b8d6b3bbc24fe9f0281ff3e8380b7502265 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 16 Nov 2011 14:09:50 +0000 Subject: [PATCH 4/4] more logging git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8047 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/RobotsTxt.java | 2 +- source/net/yacy/search/Switchboard.java | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index c689eb79e..6a439b8e7 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -378,7 +378,7 @@ public class RobotsTxt { } } else if (code == 401 || code == 403) { accessCompletelyRestricted = true; - log.info("Access to Robots.txt not allowed on URL '" + robotsURL + "'., redirectionCount = " + redirectionCount); // since this is a strange case we log it all the time + log.info("Access to Robots.txt not allowed on URL '" + robotsURL + "', redirectionCount = " + redirectionCount); // since this is a strange case we log it all the time } else { if (log.isDebugEnabled()) log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "]."); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 96f586521..7560b0e88 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -923,7 +923,10 @@ public final class Switchboard extends serverSwitch { */ // write the YaCy network identification inside the yacybot client user agent to distinguish networks String newagent = ClientIdentification.generateYaCyBot(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? "-" : "/") + getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")); - if (!getConfigBool("network.unit.dht", false) && getConfig("network.unit.tenant.agent", "").length() > 0) newagent = getConfig("network.unit.tenant.agent", ""); + if (!getConfigBool("network.unit.dht", false) && getConfig("network.unit.tenant.agent", "").length() > 0) { + newagent = getConfig("network.unit.tenant.agent", "").trim(); + this.log.logInfo("new user agent: '" + newagent + "'"); + } ClientIdentification.setUserAgent(newagent); }