replaced the custom robots.txt loader with the standard http loader

pull/1/head
Michael Peter Christen 13 years ago
parent 799d71bc67
commit 2d9e577ad0
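
Net effect of the hunks below: RobotsTxt is now constructed with a LoaderDispatcher and fetches robots.txt through the standard HTTP loader, and getEntry() no longer throws IOException, so the callers (CrawlCheck_p, getpageinfo, getpageinfo_p, Latency) drop their try/catch blocks. A minimal sketch of the new call pattern, assuming a Switchboard-like holder named sb (illustrative only, pieced together from the CrawlCheck_p and Switchboard hunks):

    // wiring, as in the Switchboard hunk: the loader must exist before the robots DB
    // this.robots = new RobotsTxt(this.tables, this.loader);

    void checkRobots(final Switchboard sb, final DigestURI url) {
        // getEntry() no longer declares IOException, so there is nothing to catch
        final RobotsTxtEntry entry = sb.robots.getEntry(url, sb.peers.myBotIDs());
        final boolean allowed = entry == null || !entry.isDisallowed(url);
        final long delayMillis = entry == null
                ? CrawlQueues.queuedMinLoadDelay
                : Math.max(CrawlQueues.queuedMinLoadDelay, entry.getCrawlDelayMillis());
        System.out.println(url + " allowed=" + allowed + ", crawl delay=" + delayMillis + " ms");
    }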

@@ -84,19 +84,16 @@ public class CrawlCheck_p {
// try to load the robots
RobotsTxtEntry robotsEntry;
boolean robotsAllowed = true;
try {
robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
if (robotsEntry == null) {
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
robotsAllowed = !robotsEntry.isDisallowed(u);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
}
} catch (final IOException e) {
robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
if (robotsEntry == null) {
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
robotsAllowed = !robotsEntry.isDisallowed(u);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
}
// try to load the url

@@ -148,13 +148,7 @@ public class getpageinfo {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
RobotsTxtEntry robotsEntry;
try {
robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (final IOException e) {
robotsEntry = null;
Log.logException(e);
}
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

@@ -148,13 +148,7 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
RobotsTxtEntry robotsEntry;
try {
robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (final IOException e) {
robotsEntry = null;
Log.logException(e);
}
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

@@ -464,10 +464,13 @@ public class Balancer {
rest = rest + 1000 * loops;
loops = 0;
}
if (rest > 0) {try {Thread.sleep(rest);} catch (final InterruptedException e) {}}
for (int i = 0; i < loops; i++) {
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {Thread.sleep(1000); } catch (final InterruptedException e) {}
synchronized(this) {
// must be synchronized here to avoid 'takeover' moves from other threads which then idle the same time which would not be enough
if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
for (int i = 0; i < loops; i++) {
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {this.wait(1000); } catch (final InterruptedException e) {}
}
}
Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
}
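
The Balancer hunk above replaces plain Thread.sleep with this.wait inside a synchronized(this) block; per the in-code comment, this avoids 'takeover' moves by other threads that would then idle the same time, which would not be enough. A stripped-down sketch of the pattern, minus the per-host log line (method and parameter names are illustrative):

    // wait out a crawl delay on the balancer's own monitor instead of sleeping unconditionally
    synchronized void waitOutCrawlDelay(final long restMillis, final int fullSeconds) {
        if (restMillis > 0) {
            try { this.wait(restMillis); } catch (final InterruptedException e) {}
        }
        for (int i = 0; i < fullSeconds; i++) {
            try { this.wait(1000); } catch (final InterruptedException e) {}
        }
    }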

@@ -45,8 +45,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
public final class CrawlSwitchboard
{
public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";

@@ -316,7 +316,6 @@ public class CrawlQueues {
* @param stats String for log prefixing
* @return
*/
@SuppressWarnings("unused")
private void load(final Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
if (profile != null) {
@@ -340,7 +339,16 @@ public class CrawlQueues {
if (urlEntry == null || urlEntry.url() == null) {
this.log.logInfo(stats + ": urlEntry = null");
} else {
new Loader(urlEntry);
if (!this.workers.containsKey(Integer.valueOf(urlEntry.hashCode()))) {
Loader loader = new Loader(urlEntry);
this.workers.put(loader.code, loader);
try {
loader.start();
} catch (final OutOfMemoryError e) {
Log.logWarning("CrawlQueues", "crawlWorker sequential fail-over: " + e.getMessage());
loader.run();
}
}
}
} else {
@@ -615,16 +623,7 @@ public class CrawlQueues {
this.request = entry;
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
if (!CrawlQueues.this.workers.containsKey(this.code)) {
CrawlQueues.this.workers.put(this.code, this);
try {
start();
} catch (final OutOfMemoryError e) {
Log.logWarning("CrawlQueues", "crawlWorker sequential fail-over: " + e.getMessage());
run();
}
}
setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
}
public long age() {
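
The two CrawlQueues hunks above move worker registration and start-up out of the Loader constructor into the caller, but keep the same fail-over idea: if the JVM cannot spawn another worker thread, the request is handled synchronously in the calling thread. The pattern in isolation, detached from the YaCy classes (the Runnable and thread name are illustrative):

    static void startOrRunInline(final Runnable work) {
        final Thread worker = new Thread(work, "crawlWorker");
        try {
            worker.start();   // preferred: handle the request in a parallel worker
        } catch (final OutOfMemoryError e) {
            work.run();       // sequential fail-over: no room for another thread, do the work inline
        }
    }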

@@ -23,7 +23,6 @@
package net.yacy.crawler.data;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@@ -100,12 +99,7 @@ public class Latency {
*/
public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
long robotsDelay = 0;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = robots.getEntry(url, thisAgents);
} catch (final IOException e) {
robotsEntry = null;
}
RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents);
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
return robotsDelay;

@@ -201,7 +201,7 @@ public final class HTTPLoader {
}
// create a new cache entry
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = request.profileHandle() == null ? null : this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
response = new Response(
request,
requestHeader,

@@ -292,9 +292,7 @@ public class Request extends WorkflowJob
public String profileHandle() {
// the handle of the crawl profile
assert this.profileHandle.length() == Word.commonHashLength : this.profileHandle
+ " != "
+ Word.commonHashLength;
assert this.profileHandle == null || this.profileHandle.length() == Word.commonHashLength : this.profileHandle + " != " + Word.commonHashLength;
return this.profileHandle;
}

@@ -36,17 +36,15 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;
import org.apache.log4j.Logger;
@@ -61,14 +59,16 @@ public class RobotsTxt {
private final ConcurrentHashMap<String, DomSync> syncObjects;
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
private final WorkTables tables;
private final LoaderDispatcher loader;
private static class DomSync {
private DomSync() {}
}
public RobotsTxt(final WorkTables worktables) {
public RobotsTxt(final WorkTables worktables, LoaderDispatcher loader) {
this.syncObjects = new ConcurrentHashMap<String, DomSync>();
this.tables = worktables;
this.loader = loader;
try {
this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
//log.info("initiated robots table: " + this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).getFile());
@@ -90,23 +90,31 @@ public class RobotsTxt {
return this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).size();
}
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
return getEntry(theURL, thisAgents, true);
}
private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
// this method will always return a non-null value
final String urlHostPort = getHostPort(theURL);
RobotsTxtEntry robotsTxt4Host = null;
Map<String, byte[]> record;
final BEncodedHeap robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
BEncodedHeap robotsTable = null;
try {
robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
} catch (IOException e1) {
log.fatal("tables not available", e1);
}
try {
record = robotsTable.get(robotsTable.encodedKey(urlHostPort));
} catch (final SpaceExceededException e) {
log.warn("memory exhausted", e);
record = null;
} catch (IOException e) {
log.warn("cannot get robotstxt from table", e);
record = null;
}
if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
@@ -135,6 +143,9 @@ public class RobotsTxt {
} catch (final SpaceExceededException e) {
log.warn("memory exhausted", e);
record = null;
} catch (IOException e) {
log.warn("cannot get robotstxt from table", e);
record = null;
}
if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
if (robotsTxt4Host != null &&
@@ -144,32 +155,26 @@ public class RobotsTxt {
}
// generating the proper url to download the robots txt
MultiProtocolURI robotsURL = null;
DigestURI robotsURL = null;
try {
robotsURL = new MultiProtocolURI("http://" + urlHostPort + "/robots.txt");
robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null;
}
Object[] result = null;
Response response = null;
if (robotsURL != null) {
if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
Request request = new Request(robotsURL, null);
try {
result = downloadRobotsTxt(robotsURL, 3, robotsTxt4Host);
} catch (final Exception e) {
result = null;
response = this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
} catch (IOException e) {
response = null;
}
}
/*
assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
"robots-url=" + robotsURL.toString() +
", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : UTF8.String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
loadedRobots.add(robotsURL.toNormalform(false, false));
*/
if (result == null) {
if (response == null) {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if (robotsTxt4Host == null) {
// generate artificial entry
@@ -192,15 +197,15 @@ public class RobotsTxt {
addEntry(robotsTxt4Host);
if (robotsTable.size() <= sz) {
log.fatal("new entry in robots.txt table failed, resetting database");
clear();
try {clear();} catch (IOException e) {}
addEntry(robotsTxt4Host);
}
} else {
final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
final byte[] robotsTxt = response.getContent();
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
RobotsTxtParser parserResult;
ArrayList<String> denyPath;
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
parserResult = new RobotsTxtParser(thisAgents);
// create virtual deny path
denyPath = new ArrayList<String>();
@@ -211,13 +216,14 @@ public class RobotsTxt {
}
// store the data into the robots DB
String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
robotsTxt4Host = addEntry(
robotsURL,
parserResult.allowList(),
denyPath,
new Date(),
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
response.getResponseHeader().lastModified(),
etag,
parserResult.sitemap(),
parserResult.crawlDelayMillis(),
parserResult.agentName());
@@ -259,13 +265,6 @@ public class RobotsTxt {
}
}
// methods that had been in robotsParser.java:
private static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
static final int DOWNLOAD_ROBOTS_TXT = 1;
private static final int DOWNLOAD_ETAG = 2;
private static final int DOWNLOAD_MODDATE = 3;
static final String getHostPort(final MultiProtocolURI theURL) {
final int port = getPort(theURL);
String host = theURL.getHost();
@@ -287,131 +286,4 @@ public class RobotsTxt {
return port;
}
protected static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsTxtEntry entry) throws Exception {
if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;
if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
redirectionCount--;
boolean accessCompletelyRestricted = false;
byte[] robotsTxt = null;
long downloadStart, downloadEnd;
String eTag=null, oldEtag = null;
Date lastMod=null;
downloadStart = System.currentTimeMillis();
// if we previously have downloaded this robots.txt then we can set the if-modified-since header
RequestHeader reqHeaders = new RequestHeader();
// add yacybot user agent
reqHeaders.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
// adding referer
reqHeaders.put(RequestHeader.REFERER, (MultiProtocolURI.newURL(robotsURL,"/")).toNormalform(true));
reqHeaders.put(HeaderFramework.ACCEPT, HTTPLoader.DEFAULT_ACCEPT);
if (entry != null) {
oldEtag = entry.getETag();
reqHeaders = new RequestHeader();
final Date modDate = entry.getModDate();
if (modDate != null) reqHeaders.put(RequestHeader.IF_MODIFIED_SINCE, HeaderFramework.formatRFC1123(entry.getModDate()));
}
// setup http-client
//TODO: adding Traffic statistic for robots download?
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
client.setHeader(reqHeaders.entrySet());
try {
// check for interruption
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress.");
// sending the get request
robotsTxt = client.GETbytes(robotsURL);
// statistics:
if (robotsTxt != null) {
ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length);
}
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
// check the response status
if (code > 199 && code < 300) {
if (!header.mime().startsWith("text/plain")) {
robotsTxt = null;
log.info("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'.");
} else {
// getting some metadata
eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null;
lastMod = header.lastModified();
// if the robots.txt file was not changed we break here
if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
if (log.isDebugEnabled()) log.debug("Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
return null;
}
downloadEnd = System.currentTimeMillis();
if (log.isDebugEnabled()) log.debug("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms.");
}
} else if (code == 304) {
return null;
} else if (code > 299 && code < 400) {
// getting redirection URL
String redirectionUrlString = header.get(HeaderFramework.LOCATION);
if (redirectionUrlString==null) {
if (log.isDebugEnabled())
log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + client.getHttpResponse().getStatusLine() + "].");
robotsTxt = null;
} else {
redirectionUrlString = redirectionUrlString.trim();
// generating the new URL object
final MultiProtocolURI redirectionUrl = MultiProtocolURI.newURL(robotsURL, redirectionUrlString);
// following the redirection
if (log.isDebugEnabled()) log.debug("Redirection detected for robots.txt with URL '" + robotsURL + "'." +
"\nRedirecting request to: " + redirectionUrl);
return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);
}
} else if (code == 401 || code == 403) {
accessCompletelyRestricted = true;
log.info("Access to Robots.txt not allowed on URL '" + robotsURL + "', redirectionCount = " + redirectionCount); // since this is a strange case we log it all the time
} else {
if (log.isDebugEnabled())
log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "].");
robotsTxt = null;
}
} catch (final Exception e) {
throw e;
}
return new Object[]{Boolean.valueOf(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
}
public final static void main(final String[] args) throws Exception {
final String url = "http://www.badelatschen.net/robots.txt";
final Object[] o = downloadRobotsTxt(new MultiProtocolURI(url), 0, null);
if (o == null) {
System.out.println("result: null");
} else {
System.out.println("not allowed = " + ((Boolean) o[0]).toString());
System.out.println("robots = " + ((o[1] == null) ? "null" : UTF8.String((byte[]) o[1])));
}
System.exit(0);
/*
final HttpClient httpclient = new DefaultHttpClient();
try {
final HttpGet httpget = new HttpGet(url);
final ResponseHandler<String> responseHandler = new BasicResponseHandler();
final String responseBody = httpclient.execute(httpget, responseHandler);
System.out.println(responseBody);
} finally {
httpclient.getConnectionManager().shutdown();
}
*/
}
}
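
Net effect of the RobotsTxt hunks above: the hand-rolled downloadRobotsTxt HTTP code (redirect handling, mime check, byte accounting) is deleted, and robots.txt is fetched through the same LoaderDispatcher as any other document. A condensed sketch of the new getEntry() download path, with database bookkeeping and the artificial "no robots.txt" entry omitted (imports as in RobotsTxt.java):

    DigestURI robotsURL = null;
    try {
        robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
    } catch (final MalformedURLException e) {
        robotsURL = null;
    }
    Response response = null;
    if (robotsURL != null) {
        try {
            response = this.loader.load(new Request(robotsURL, null), CacheStrategy.NOCACHE, null, 0);
        } catch (final IOException e) {
            response = null;   // handled like "no robots.txt available"
        }
    }
    if (response != null) {
        final int status = response.getResponseHeader().getStatusCode();
        final boolean accessRestricted = (status == 401 || status == 403);   // parsed as a virtual deny-all
        final byte[] robotsTxt = response.getContent();
        final String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG)
                ? response.getResponseHeader().get(HeaderFramework.ETAG).trim() : null;
        final Date lastMod = response.getResponseHeader().lastModified();
        // robotsTxt, etag and lastMod feed the RobotsTxtEntry stored in the robots table
    }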

@@ -35,15 +35,11 @@ import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.data.meta.DigestURI;
/*
* A class for Parsing robots.txt files.
@@ -100,35 +96,6 @@ public final class RobotsTxtParser {
}
}
public static RobotsTxtParser getRobots(String homepage) {
DigestURI theURL;
try {
theURL = new DigestURI(homepage);
} catch (MalformedURLException e1) {
return null;
}
final String urlHostPort = RobotsTxt.getHostPort(theURL);
MultiProtocolURI robotsURL = null;
try {
robotsURL = new MultiProtocolURI("http://" + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
return null;
}
Object[] result;
try {
result = RobotsTxt.downloadRobotsTxt(robotsURL, 0, null);
} catch (Exception e) {
return null;
}
final byte[] robotsTxt = (byte[]) result[RobotsTxt.DOWNLOAD_ROBOTS_TXT];
RobotsTxtParser parserResult = new RobotsTxtParser(new HashSet<String>(), robotsTxt);
return parserResult;
}
private void parse(final BufferedReader reader) {
final ArrayList<String> deny4AllAgents = new ArrayList<String>();
final ArrayList<String> deny4ThisAgents = new ArrayList<String>();

@@ -194,8 +194,8 @@ public final class LoaderDispatcher {
}
// check if we have the page in the cache
final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry

@@ -596,11 +596,6 @@ public final class Switchboard extends serverSwitch
}.start();
*/
// load the robots.txt db
this.log.logConfig("Initializing robots.txt DB");
this.robots = new RobotsTxt(this.tables);
this.log.logConfig("Loaded robots.txt DB: " + this.robots.size() + " entries");
// start a cache manager
this.log.logConfig("Starting HT Cache Manager");
@@ -718,6 +713,13 @@ public final class Switchboard extends serverSwitch
// start a loader
this.log.logConfig("Starting Crawl Loader");
this.loader = new LoaderDispatcher(this);
// load the robots.txt db
this.log.logConfig("Initializing robots.txt DB");
this.robots = new RobotsTxt(this.tables, this.loader);
this.log.logConfig("Loaded robots.txt DB: " + this.robots.size() + " entries");
// load oai tables
final Map<String, File> oaiFriends =
OAIListFriendsLoader.loadListFriendsSources(
new File("defaults/oaiListFriendsSource.xml"),
