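Fix wrong robots.txt loading for the HTTPS protocol: the robots.txt URL was always built with "http://"; it now uses "https://" when the host:port ends with ":443".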

see also: http://forum.yacy-websuche.de/viewtopic.php?f=5&t=4579
pull/1/head
Michael Peter Christen 12 years ago
parent edbc86d2b0
commit af465cdca5

@ -132,9 +132,7 @@ public class RobotsTxt {
// we can now synchronize for each host separately // we can now synchronize for each host separately
synchronized (syncObj) { synchronized (syncObj) {
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
// check the robots table again for all threads that come here because they waited for another one // check the robots table again for all threads that come here because they waited for another one
// to complete a download // to complete a download
try { try {
@ -156,7 +154,7 @@ public class RobotsTxt {
// generating the proper url to download the robots txt // generating the proper url to download the robots txt
DigestURI robotsURL = null; DigestURI robotsURL = null;
try { try {
robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt"); robotsURL = new DigestURI((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) { } catch (final MalformedURLException e) {
log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e); log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null; robotsURL = null;
@ -175,58 +173,9 @@ public class RobotsTxt {
} }
if (response == null) { if (response == null) {
// no robots.txt available, make an entry to prevent that the robots loading is done twice processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
if (robotsTxt4Host == null) {
// generate artificial entry
robotsTxt4Host = new RobotsTxtEntry(
robotsURL,
new ArrayList<String>(),
new ArrayList<String>(),
new Date(),
new Date(),
null,
null,
Integer.valueOf(0),
null);
} else {
robotsTxt4Host.setLoadedDate(new Date());
}
// store the data into the robots DB
final int sz = robotsTable.size();
addEntry(robotsTxt4Host);
if (robotsTable.size() <= sz) {
log.fatal("new entry in robots.txt table failed, resetting database");
try {clear();} catch (IOException e) {}
addEntry(robotsTxt4Host);
}
} else { } else {
final byte[] robotsTxt = response.getContent(); processNewEntry(robotsURL, response, thisAgents);
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
RobotsTxtParser parserResult;
ArrayList<String> denyPath;
if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
parserResult = new RobotsTxtParser(thisAgents);
// create virtual deny path
denyPath = new ArrayList<String>();
denyPath.add("/");
} else {
parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
denyPath = parserResult.denyList();
}
// store the data into the robots DB
String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
robotsTxt4Host = addEntry(
robotsURL,
parserResult.allowList(),
denyPath,
new Date(),
response.getResponseHeader().lastModified(),
etag,
parserResult.sitemap(),
parserResult.crawlDelayMillis(),
parserResult.agentName());
} }
} }
} }
@ -246,116 +195,105 @@ public class RobotsTxt {
return; return;
} }
if (robotsTable == null || robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return; if (robotsTable == null || robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
Thread t = new Thread() {
public void run(){
// make or get a synchronization object
DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
if (syncObj == null) {
syncObj = new DomSync();
RobotsTxt.this.syncObjects.put(urlHostPort, syncObj);
}
// we can now synchronize for each host separately
synchronized (syncObj) {
if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
if (concurrent) // generating the proper url to download the robots txt
new Thread() {public void run(){ensureExist(urlHostPort, robotsTable, thisAgents);}}.start(); DigestURI robotsURL = null;
else try {
ensureExist(urlHostPort, robotsTable, thisAgents); robotsURL = new DigestURI((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
} } catch (final MalformedURLException e) {
log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
private void ensureExist(final String urlHostPort, BEncodedHeap robotsTable, final Set<String> thisAgents) { robotsURL = null;
}
// make or get a synchronization object
DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
if (syncObj == null) {
syncObj = new DomSync();
RobotsTxt.this.syncObjects.put(urlHostPort, syncObj);
}
// we can now synchronize for each host separately
synchronized (syncObj) {
if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
// generating the proper url to download the robots txt Response response = null;
DigestURI robotsURL = null; if (robotsURL != null) {
try { if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt"); Request request = new Request(robotsURL, null);
} catch (final MalformedURLException e) { try {
log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e); response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
robotsURL = null; } catch (IOException e) {
} response = null;
}
}
Response response = null; if (response == null) {
if (robotsURL != null) { processOldEntry(null, robotsURL, robotsTable);
if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'."); } else {
Request request = new Request(robotsURL, null); processNewEntry(robotsURL, response, thisAgents);
try { }
response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
} catch (IOException e) {
response = null;
} }
} }
};
if (concurrent) t.start(); else t.run();
}
RobotsTxtEntry robotsTxt4Host = null; private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURI robotsURL, BEncodedHeap robotsTable) {
if (response == null) { // no robots.txt available, make an entry to prevent that the robots loading is done twice
// no robots.txt available, make an entry to prevent that the robots loading is done twice if (robotsTxt4Host == null) {
// generate artificial entry // generate artificial entry
robotsTxt4Host = new RobotsTxtEntry( robotsTxt4Host = new RobotsTxtEntry(
robotsURL, robotsURL,
new ArrayList<String>(), new ArrayList<String>(),
new ArrayList<String>(), new ArrayList<String>(),
new Date(), new Date(),
new Date(), new Date(),
null, null,
null, null,
Integer.valueOf(0), Integer.valueOf(0),
null); null);
} else {
// store the data into the robots DB robotsTxt4Host.setLoadedDate(new Date());
final int sz = robotsTable.size(); }
addEntry(robotsTxt4Host);
if (robotsTable.size() <= sz) {
log.fatal("new entry in robots.txt table failed, resetting database");
try {clear();} catch (IOException e) {}
addEntry(robotsTxt4Host);
}
} else {
final byte[] robotsTxt = response.getContent();
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
RobotsTxtParser parserResult;
ArrayList<String> denyPath;
if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
parserResult = new RobotsTxtParser(thisAgents);
// create virtual deny path
denyPath = new ArrayList<String>();
denyPath.add("/");
} else {
parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
denyPath = parserResult.denyList();
}
// store the data into the robots DB // store the data into the robots DB
String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null; final int sz = robotsTable.size();
robotsTxt4Host = addEntry( addEntry(robotsTxt4Host);
robotsURL, if (robotsTable.size() <= sz) {
parserResult.allowList(), log.fatal("new entry in robots.txt table failed, resetting database");
denyPath, try {clear();} catch (IOException e) {}
new Date(), addEntry(robotsTxt4Host);
response.getResponseHeader().lastModified(),
etag,
parserResult.sitemap(),
parserResult.crawlDelayMillis(),
parserResult.agentName());
}
} }
} }
private RobotsTxtEntry addEntry( private void processNewEntry(DigestURI robotsURL, Response response, final Set<String> thisAgents) {
final MultiProtocolURI theURL, final byte[] robotsTxt = response.getContent();
final ArrayList<String> allowPathList, //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
final ArrayList<String> denyPathList, RobotsTxtParser parserResult;
final Date loadedDate, ArrayList<String> denyPath;
final Date modDate, if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
final String eTag, parserResult = new RobotsTxtParser(thisAgents);
final String sitemap, // create virtual deny path
final long crawlDelayMillis, denyPath = new ArrayList<String>();
final String agentName denyPath.add("/");
) { } else {
final RobotsTxtEntry entry = new RobotsTxtEntry( parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
theURL, allowPathList, denyPathList, denyPath = parserResult.denyList();
loadedDate, modDate, }
eTag, sitemap, crawlDelayMillis, agentName);
addEntry(entry); // store the data into the robots DB
return entry; String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
robotsURL,
parserResult.allowList(),
denyPath,
new Date(),
response.getResponseHeader().lastModified(),
etag,
parserResult.sitemap(),
parserResult.crawlDelayMillis(),
parserResult.agentName());
addEntry(robotsTxt4Host);
} }
private String addEntry(final RobotsTxtEntry entry) { private String addEntry(final RobotsTxtEntry entry) {
@ -371,25 +309,21 @@ public class RobotsTxt {
} }
static final String getHostPort(final MultiProtocolURI theURL) { static final String getHostPort(final MultiProtocolURI theURL) {
final int port = getPort(theURL);
String host = theURL.getHost();
if (host == null) return null;
StringBuilder sb = new StringBuilder(host.length() + 6);
sb.append(host).append(':').append(Integer.toString(port));
return sb.toString();
}
private static final int getPort(final MultiProtocolURI theURL) {
int port = theURL.getPort(); int port = theURL.getPort();
if (port == -1) { if (port == -1) {
if (theURL.getProtocol().equalsIgnoreCase("http")) { if (theURL.getProtocol().equalsIgnoreCase("http")) {
port = 80; port = 80;
} else if (theURL.getProtocol().equalsIgnoreCase("https")) { } else if (theURL.getProtocol().equalsIgnoreCase("https")) {
port = 443; port = 443;
} else {
port = 80;
} }
} }
return port; String host = theURL.getHost();
if (host == null) return null;
StringBuilder sb = new StringBuilder(host.length() + 6);
sb.append(host).append(':').append(Integer.toString(port));
return sb.toString();
} }
} }

Loading…
Cancel
Save