pull/1/head
sixcooler 13 years ago
commit 605bc4c10e

@ -196,12 +196,17 @@ public class RobotsTxt {
}
} else {
final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + UTF8.String(robotsTxt)); // debug TODO remove
final RobotsTxtParser parserResult = new RobotsTxtParser(robotsTxt, thisAgents);
ArrayList<String> denyPath = parserResult.denyList();
Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
RobotsTxtParser parserResult;
ArrayList<String> denyPath;
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
parserResult = new RobotsTxtParser(thisAgents);
// create virtual deny path
denyPath = new ArrayList<String>();
denyPath.add("/");
} else {
parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
denyPath = parserResult.denyList();
}
// store the data into the robots DB
@ -373,7 +378,7 @@ public class RobotsTxt {
}
} else if (code == 401 || code == 403) {
accessCompletelyRestricted = true;
if (log.isDebugEnabled()) log.debug("Access to Robots.txt not allowed on URL '" + robotsURL + "'.");
log.info("Access to Robots.txt not allowed on URL '" + robotsURL + "', redirectionCount = " + redirectionCount); // since this is a strange case we log it all the time
} else {
if (log.isDebugEnabled())
log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "].");
@ -384,4 +389,29 @@ public class RobotsTxt {
}
return new Object[]{Boolean.valueOf(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
}
/**
 * Manual check of the robots.txt download: fetches the robots.txt of one
 * fixed URL via {@code downloadRobotsTxt} and prints the first two entries
 * of the returned array (the access-restricted flag and the raw robots.txt
 * content) to standard out, then terminates the JVM.
 *
 * @param args command line arguments; not evaluated
 * @throws Exception if creating the URL or downloading fails
 */
public final static void main(final String[] args) throws Exception {
    final String url = "http://www.badelatschen.net/robots.txt";
    final MultiProtocolURI robotsLocation = new MultiProtocolURI(url);
    final Object[] download = downloadRobotsTxt(robotsLocation, 0, null);

    if (download != null) {
        // slot 0: Boolean flag, slot 1: robots.txt bytes (may be null)
        final String notAllowed = ((Boolean) download[0]).toString();
        System.out.println("not allowed = " + notAllowed);

        final Object content = download[1];
        final String robots = (content == null) ? "null" : UTF8.String((byte[]) content);
        System.out.println("robots = " + robots);
    } else {
        System.out.println("result: null");
    }

    // exit explicitly once the result has been printed
    System.exit(0);

    /*
     * Disabled alternative that fetches the same URL through a plain
     * HttpClient request, kept for reference:
     *
     * final HttpClient httpclient = new DefaultHttpClient();
     * try {
     *     final HttpGet httpget = new HttpGet(url);
     *     final ResponseHandler<String> responseHandler = new BasicResponseHandler();
     *     final String responseBody = httpclient.execute(httpget, responseHandler);
     *     System.out.println(responseBody);
     * } finally {
     *     httpclient.getConnectionManager().shutdown();
     * }
     */
}
}

@ -78,13 +78,17 @@ public final class RobotsTxtParser {
private final Set<String> myNames; // a list of own name lists
private String agentName; // the name of the agent that was used to return the result
protected RobotsTxtParser(final byte[] robotsTxt, final Set<String> myNames) {
/**
 * Creates a parser result that carries no rules: the allow and deny lists
 * are empty, the sitemap is the empty string, the crawl delay is zero and
 * no agent name is set. Only the given set of own agent names is stored.
 *
 * @param myNames the set of own agent names to remember in this parser
 */
protected RobotsTxtParser(final Set<String> myNames) {
    // identity of the requesting side
    this.myNames = myNames;
    this.agentName = null;

    // empty rule set
    this.denyList = new ArrayList<String>(0);
    this.allowList = new ArrayList<String>(0);

    // no additional hints
    this.crawlDelayMillis = 0;
    this.sitemap = "";
}
protected RobotsTxtParser(final Set<String> myNames, final byte[] robotsTxt) {
this(myNames);
if (robotsTxt != null && robotsTxt.length != 0) {
final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
final BufferedReader reader = new BufferedReader(new InputStreamReader(bin));

@ -923,7 +923,10 @@ public final class Switchboard extends serverSwitch {
*/
// write the YaCy network identification inside the yacybot client user agent to distinguish networks
String newagent = ClientIdentification.generateYaCyBot(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? "-" : "/") + getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global"));
if (!getConfigBool("network.unit.dht", false) && getConfig("network.unit.tenant.agent", "").length() > 0) newagent = getConfig("network.unit.tenant.agent", "");
if (!getConfigBool("network.unit.dht", false) && getConfig("network.unit.tenant.agent", "").length() > 0) {
newagent = getConfig("network.unit.tenant.agent", "").trim();
this.log.logInfo("new user agent: '" + newagent + "'");
}
ClientIdentification.setUserAgent(newagent);
}

Loading…
Cancel
Save