From eb1c7c041d27677f6baa8d2516c117d4036329cc Mon Sep 17 00:00:00 2001
From: orbiter
Date: Tue, 15 Nov 2011 00:33:54 +0000
Subject: [PATCH] write info about robots.txt evaluation into getpageinfo_p.xml

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8038 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/api/getpageinfo_p.java                |   3 +
 htroot/api/getpageinfo_p.xml                 |   1 +
 source/de/anomic/crawler/RobotsTxtEntry.java | 100 +++++++++++--------
 3 files changed, 61 insertions(+), 43 deletions(-)

diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 888a2b38e..68c490807 100755
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -25,6 +25,7 @@ public class getpageinfo_p {
         prop.put("desc", "");
         prop.put("lang", "");
         prop.put("robots-allowed", "3"); //unknown
+        prop.put("robotsInfo", ""); //unknown
         prop.put("sitemap", "");
         prop.put("favicon","");
         prop.put("sitelist", "");
@@ -39,6 +40,7 @@ public class getpageinfo_p {
            String url=post.get("url");
            if (url.toLowerCase().startsWith("ftp://")) {
                prop.put("robots-allowed", "1");
+               prop.put("robotsInfo", "ftp does not follow robots.txt");
                prop.putXML("title", "FTP: " + url);
                return prop;
            } else if (!url.startsWith("http://") &&
@@ -114,6 +116,7 @@ public class getpageinfo_p {
                Log.logException(e);
            }
            prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
+           prop.putHTML("robotsInfo", robotsEntry.getInfo());
 
            // get the sitemap URL of the domain
            final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
diff --git a/htroot/api/getpageinfo_p.xml b/htroot/api/getpageinfo_p.xml
index b9590c990..84da4eb97 100644
--- a/htroot/api/getpageinfo_p.xml
+++ b/htroot/api/getpageinfo_p.xml
@@ -4,6 +4,7 @@
   #[desc]#
   #[lang]#
   #(robots-allowed)#0::1::#(/robots-allowed)#
+  #[robotsInfo]#
   #[sitemap]#
   #[favicon]#
   #[sitelist]#
diff --git a/source/de/anomic/crawler/RobotsTxtEntry.java b/source/de/anomic/crawler/RobotsTxtEntry.java
index 1b6636883..394f87802 100644
--- a/source/de/anomic/crawler/RobotsTxtEntry.java
+++ b/source/de/anomic/crawler/RobotsTxtEntry.java
@@ -1,4 +1,4 @@
-//RobotsEntry.java 
+//RobotsEntry.java
 //-------------------------------------
 //part of YACY
 //(C) by Michael Peter Christen; mc@yacy.net
@@ -43,7 +43,7 @@ import net.yacy.kelondro.util.ByteArray;
 
 public class RobotsTxtEntry {
-    
+
     private static final String HOST_NAME = "hostname";
     private static final String ALLOW_PATH_LIST = "allow";
     private static final String DISALLOW_PATH_LIST = "disallow";
@@ -54,16 +54,18 @@ public class RobotsTxtEntry {
     private static final String CRAWL_DELAY = "crawlDelay";
     private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
     private static final String AGENT_NAME = "agentname";
-    
+
     // this is a simple record structure that holds all properties of a single crawl start
     private final Map<String, byte[]> mem;
     private final List<String> allowPathList, denyPathList;
     private final String hostName, agentName;
-    
+    private String info; // this is filled if robots disallowed access; then the reason is noted there;
+
     protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
         this.hostName = hostName.toLowerCase();
-        this.mem = mem; 
-        
+        this.mem = mem;
+        this.info = "";
+        
         if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
             this.denyPathList = new LinkedList<String>();
             final String csPl = UTF8.String(this.mem.get(DISALLOW_PATH_LIST));
@@ -89,12 +91,12 @@ public class RobotsTxtEntry {
             this.allowPathList = new LinkedList<String>();
         }
         this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
-    }
-    
+    }
+
     protected RobotsTxtEntry(
-            final MultiProtocolURI theURL, 
-            final List<String> allowPathList, 
-            final List<String> disallowPathList,
+            final MultiProtocolURI theURL,
+            final List<String> allowPathList,
+            final List<String> disallowPathList,
             final Date loadedDate,
             final Date modDate,
             final String eTag,
@@ -103,12 +105,12 @@ public class RobotsTxtEntry {
             final String agentName
             ) {
         if (theURL == null) throw new IllegalArgumentException("The url is missing");
-        
+
         this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
         this.allowPathList = new LinkedList<String>();
         this.denyPathList = new LinkedList<String>();
         this.agentName = agentName;
-        
+
         this.mem = new LinkedHashMap<String, byte[]>(10);
         this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
         if (loadedDate != null) this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(loadedDate.getTime())));
@@ -117,92 +119,92 @@ public class RobotsTxtEntry {
         if (sitemap != null) this.mem.put(SITEMAP, UTF8.getBytes(sitemap));
         if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, UTF8.getBytes(Long.toString(crawlDelayMillis)));
         if (agentName != null) this.mem.put(AGENT_NAME, UTF8.getBytes(agentName));
-        
+
         if (allowPathList != null && !allowPathList.isEmpty()) {
             this.allowPathList.addAll(allowPathList);
-            
+
             final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30);
-            for (String element : allowPathList) {
+            for (final String element : allowPathList) {
                 pathListStr.append(element)
                            .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
             }
             this.mem.put(ALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0,pathListStr.length()-1)));
         }
-        
+
         if (disallowPathList != null && !disallowPathList.isEmpty()) {
             this.denyPathList.addAll(disallowPathList);
-            
+
             final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30);
-            for (String element : disallowPathList) {
+            for (final String element : disallowPathList) {
                 pathListStr.append(element)
                            .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
             }
             this.mem.put(DISALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0, pathListStr.length()-1)));
         }
     }
-    
+
     protected String getHostName() {
         return this.hostName;
     }
-    
+
     protected String getAgentName() {
         return this.agentName;
     }
-    
+
     protected Map<String, byte[]> getMem() {
         if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
         return this.mem;
     }
-    
+
     @Override
     public String toString() {
         final StringBuilder str = new StringBuilder(6000);
         str.append((this.hostName == null) ? "null" : this.hostName).append(": ");
         if (this.mem != null) str.append(this.mem.toString());
         return str.toString();
-    }
-    
+    }
+
     /**
      * get the sitemap url
      * @return the sitemap url or null if no sitemap url is given
     */
     public MultiProtocolURI getSitemap() {
-        String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
+        final String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
         if (url == null) return null;
         try {
             return new MultiProtocolURI(url);
-        } catch (MalformedURLException e) {
+        } catch (final MalformedURLException e) {
             return null;
         }
     }
-    
+
     protected Date getLoadedDate() {
         if (this.mem.containsKey(LOADED_DATE)) {
             return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
         }
         return null;
     }
-    
+
     protected void setLoadedDate(final Date newLoadedDate) {
         if (newLoadedDate != null) {
             this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
         }
     }
-    
+
     protected Date getModDate() {
         if (this.mem.containsKey(MOD_DATE)) {
             return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
         }
         return null;
-    }
-    
+    }
+
     protected String getETag() {
         if (this.mem.containsKey(ETAG)) {
             return ASCII.String(this.mem.get(ETAG));
         }
         return null;
-    }
-    
+    }
+
     protected long getCrawlDelayMillis() {
         if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
             return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
@@ -214,26 +216,38 @@ public class RobotsTxtEntry {
         } catch (final NumberFormatException e) {
             return 0;
         }
-        return 0; 
+        return 0;
     }
-    
-    public boolean isDisallowed(MultiProtocolURI subpathURL) {
+
+    public boolean isDisallowed(final MultiProtocolURI subpathURL) {
         String path = subpathURL.getFile();
-        if ((this.mem == null) || (this.denyPathList.isEmpty())) return false;
-        
+        if (this.mem == null) {
+            this.info = "no robots file available";
+            return false;
+        }
+        if (this.denyPathList.isEmpty()) {
+            this.info = "no entry in robots.txt";
+            return false;
+        }
+
         // if the path is null or empty we set it to /
-        if ((path == null) || (path.length() == 0)) path = "/";
+        if (path == null || path.length() == 0) path = "/";
         // escaping all occurences of ; because this char is used as special char in the Robots DB
         else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
-        
-        for (String element : this.denyPathList) {
-            
+
+        for (final String element : this.denyPathList) {
+
             // disallow rule
             if (path.startsWith(element)) {
+                this.info = "path '" + path + "' starts with '" + element + "' from deny path list";
                 return true;
            }
         }
+        this.info = "path '" + path + "' does not start with any element from deny path list";
         return false;
     }
 
+    public String getInfo() {
+        return this.info;
+    }
 }
\ No newline at end of file
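
The new info string is written only as a side effect of isDisallowed(), and the constructor initializes it to an empty string, so callers should read it after the check, the way getpageinfo_p.java does above. A minimal usage sketch, not part of the patch: the helper class, method name, and the MultiProtocolURI import path are assumptions for illustration only.

    // Hypothetical caller: evaluates one URL against an already loaded robots.txt record
    // and returns the decision together with the explanatory info string added in this patch.
    import net.yacy.cora.document.MultiProtocolURI; // assumed package path for this era of the YaCy tree
    import de.anomic.crawler.RobotsTxtEntry;

    public class RobotsInfoExample {

        public static String describe(final RobotsTxtEntry robotsEntry, final MultiProtocolURI url) {
            // no robots.txt record for this host: treated as allowed, no reason available
            if (robotsEntry == null) return "allowed (no robots.txt entry)";

            // isDisallowed() fills the internal info field as a side effect,
            // so getInfo() is meaningful only after this call
            final boolean disallowed = robotsEntry.isDisallowed(url);
            return (disallowed ? "disallowed: " : "allowed: ") + robotsEntry.getInfo();
        }
    }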
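
The matching rule itself is a plain prefix test: a deny entry blocks a path when the path, after ';' has been escaped, simply starts with that entry. A small standalone illustration with plain java.lang.String, independent of the YaCy classes; the class name and sample values are made up for the demo.

    // Standalone demo of the prefix semantics used in isDisallowed() above:
    // ';' is escaped to "%3B" because it is the separator character of the robots DB,
    // then every deny entry is tested with startsWith().
    import java.util.Arrays;
    import java.util.List;

    public class DenyPrefixDemo {

        static boolean isDenied(String path, final List<String> denyPathList) {
            if (path == null || path.length() == 0) path = "/";
            else path = path.replace(";", "%3B"); // stand-in for ROBOTS_DB_PATH_SEPARATOR_MATCHER
            for (final String element : denyPathList) {
                if (path.startsWith(element)) return true;
            }
            return false;
        }

        public static void main(final String[] args) {
            final List<String> deny = Arrays.asList("/private");
            System.out.println(isDenied("/private/data.html", deny)); // true
            System.out.println(isDenied("/private2", deny));          // true  (plain prefix match)
            System.out.println(isDenied("/privacy", deny));           // false (not a prefix)
        }
    }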