write info about robots.txt evaluation into getpageinfo_p.xml

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8038 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent f8b8c82421
commit eb1c7c041d

@ -25,6 +25,7 @@ public class getpageinfo_p {
prop.put("desc", "");
prop.put("lang", "");
prop.put("robots-allowed", "3"); //unknown
prop.put("robotsInfo", ""); //unknown
prop.put("sitemap", "");
prop.put("favicon","");
prop.put("sitelist", "");
@ -39,6 +40,7 @@ public class getpageinfo_p {
String url=post.get("url");
if (url.toLowerCase().startsWith("ftp://")) {
prop.put("robots-allowed", "1");
prop.put("robotsInfo", "ftp does not follow robots.txt");
prop.putXML("title", "FTP: " + url);
return prop;
} else if (!url.startsWith("http://") &&
@ -114,6 +116,7 @@ public class getpageinfo_p {
Log.logException(e);
}
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry.getInfo());
// get the sitemap URL of the domain
final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();

@ -4,6 +4,7 @@
<desc>#[desc]#</desc>
<lang>#[lang]#</lang>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<robotsInfo>#[robotsInfo]#</robotsInfo>
<sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon>
<sitelist>#[sitelist]#</sitelist>

@ -1,4 +1,4 @@
//RobotsEntry.java
//RobotsEntry.java
//-------------------------------------
//part of YACY
//(C) by Michael Peter Christen; mc@yacy.net
@ -43,7 +43,7 @@ import net.yacy.kelondro.util.ByteArray;
public class RobotsTxtEntry {
private static final String HOST_NAME = "hostname";
private static final String ALLOW_PATH_LIST = "allow";
private static final String DISALLOW_PATH_LIST = "disallow";
@ -54,16 +54,18 @@ public class RobotsTxtEntry {
private static final String CRAWL_DELAY = "crawlDelay";
private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
private static final String AGENT_NAME = "agentname";
// this is a simple record structure that holds all properties of a single crawl start
private final Map<String, byte[]> mem;
private final List<String> allowPathList, denyPathList;
private final String hostName, agentName;
private String info; // this is filled if robots disallowed access; then the reason is noted there;
protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase();
this.mem = mem;
this.mem = mem;
this.info = "";
if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.denyPathList = new LinkedList<String>();
final String csPl = UTF8.String(this.mem.get(DISALLOW_PATH_LIST));
@ -89,12 +91,12 @@ public class RobotsTxtEntry {
this.allowPathList = new LinkedList<String>();
}
this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
}
}
protected RobotsTxtEntry(
final MultiProtocolURI theURL,
final List<String> allowPathList,
final List<String> disallowPathList,
final MultiProtocolURI theURL,
final List<String> allowPathList,
final List<String> disallowPathList,
final Date loadedDate,
final Date modDate,
final String eTag,
@ -103,12 +105,12 @@ public class RobotsTxtEntry {
final String agentName
) {
if (theURL == null) throw new IllegalArgumentException("The url is missing");
this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
this.agentName = agentName;
this.mem = new LinkedHashMap<String, byte[]>(10);
this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
if (loadedDate != null) this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(loadedDate.getTime())));
@ -117,92 +119,92 @@ public class RobotsTxtEntry {
if (sitemap != null) this.mem.put(SITEMAP, UTF8.getBytes(sitemap));
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, UTF8.getBytes(Long.toString(crawlDelayMillis)));
if (agentName != null) this.mem.put(AGENT_NAME, UTF8.getBytes(agentName));
if (allowPathList != null && !allowPathList.isEmpty()) {
this.allowPathList.addAll(allowPathList);
final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30);
for (String element : allowPathList) {
for (final String element : allowPathList) {
pathListStr.append(element)
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(ALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0,pathListStr.length()-1)));
}
if (disallowPathList != null && !disallowPathList.isEmpty()) {
this.denyPathList.addAll(disallowPathList);
final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30);
for (String element : disallowPathList) {
for (final String element : disallowPathList) {
pathListStr.append(element)
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(DISALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0, pathListStr.length()-1)));
}
}
/** Returns the host name (lower-cased in the constructor) that this robots.txt entry applies to. */
protected String getHostName() {
return this.hostName;
}
/** Returns the user-agent name this entry was parsed for, or null if none was recorded in the backing map. */
protected String getAgentName() {
return this.agentName;
}
/**
 * Returns the backing property map of this entry.
 * Lazily re-inserts the HOST_NAME key before handing the map out, so callers
 * always see the host name even when the map was constructed without it.
 * NOTE(review): the returned map is the live internal map, not a copy — callers can mutate this entry's state.
 */
protected Map<String, byte[]> getMem() {
if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
return this.mem;
}
/** Debug representation: "&lt;hostName&gt;: &lt;property map&gt;", tolerating null hostName and null map. */
@Override
public String toString() {
final StringBuilder str = new StringBuilder(6000);
str.append((this.hostName == null) ? "null" : this.hostName).append(": ");
if (this.mem != null) str.append(this.mem.toString());
return str.toString();
}
}
/**
* get the sitemap url
* @return the sitemap url or null if no sitemap url is given
*/
public MultiProtocolURI getSitemap() {
String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
final String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
if (url == null) return null;
try {
return new MultiProtocolURI(url);
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
return null;
}
}
/**
 * Returns the time the robots.txt was loaded, decoded from the LOADED_DATE
 * property (stored as a decimal epoch-millis string), or null if not recorded.
 */
protected Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) {
return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
}
return null;
}
/**
 * Records the time the robots.txt was loaded as a decimal epoch-millis string
 * under the LOADED_DATE key. A null argument is silently ignored (no-op).
 */
protected void setLoadedDate(final Date newLoadedDate) {
if (newLoadedDate != null) {
this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
}
}
/**
 * Returns the modification date stored under the MOD_DATE property
 * (decimal epoch-millis string), or null if not recorded.
 */
protected Date getModDate() {
if (this.mem.containsKey(MOD_DATE)) {
return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
}
return null;
}
}
/**
 * Returns the HTTP ETag recorded for the robots.txt response (ASCII-decoded
 * from the ETAG property), or null if none was stored.
 */
protected String getETag() {
if (this.mem.containsKey(ETAG)) {
return ASCII.String(this.mem.get(ETAG));
}
return null;
}
}
protected long getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
@ -214,26 +216,38 @@ public class RobotsTxtEntry {
} catch (final NumberFormatException e) {
return 0;
}
return 0;
return 0;
}
public boolean isDisallowed(MultiProtocolURI subpathURL) {
public boolean isDisallowed(final MultiProtocolURI subpathURL) {
String path = subpathURL.getFile();
if ((this.mem == null) || (this.denyPathList.isEmpty())) return false;
if (this.mem == null) {
this.info = "no robots file available";
return false;
}
if (this.denyPathList.isEmpty()) {
this.info = "no entry in robots.txt";
return false;
}
// if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/";
if (path == null || path.length() == 0) path = "/";
// escaping all occurences of ; because this char is used as special char in the Robots DB
else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
for (String element : this.denyPathList) {
for (final String element : this.denyPathList) {
// disallow rule
if (path.startsWith(element)) {
this.info = "path '" + path + "' starts with '" + element + "' from deny path list";
return true;
}
}
this.info = "path '" + path + "' does not start with any element from deny path list";
return false;
}
/**
 * Returns the human-readable explanation recorded by the most recent
 * isDisallowed() evaluation (e.g. which deny-path matched, or that no
 * robots.txt entry exists). Empty string until an evaluation has run.
 * NOTE(review): this field is mutated per isDisallowed() call, so the value
 * reflects only the last URL checked — not thread-safe for concurrent checks.
 */
public String getInfo() {
return this.info;
}
}
Loading…
Cancel
Save