fixes to crawler and new user-agent crawl-delay handling

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7640 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent b2fe4b7b1a
commit 96c32e87b0

@ -193,7 +193,7 @@ public class Latency {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0) return 0; // no limits if granted exclusively for this peer
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return 0; // no limits if granted exclusively for this peer
}
waiting = Math.max(waiting, robotsDelay);

@ -52,11 +52,12 @@ public class RobotsEntry {
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
public static final String AGENT_NAME = "agentname";
// this is a simple record structure that holds all properties of a single crawl start
private final Map<String, byte[]> mem;
private final List<String> allowPathList, denyPathList;
private final String hostName;
private final String hostName, agentName;
public RobotsEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase();
@ -86,6 +87,7 @@ public class RobotsEntry {
} else {
this.allowPathList = new LinkedList<String>();
}
this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
}
public RobotsEntry(
@ -96,21 +98,24 @@ public class RobotsEntry {
final Date modDate,
final String eTag,
final String sitemap,
final long crawlDelayMillis
final long crawlDelayMillis,
final String agentName
) {
if (theURL == null) throw new IllegalArgumentException("The url is missing");
this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
this.agentName = agentName;
this.mem = new LinkedHashMap<String, byte[]>(10);
this.mem.put(HOST_NAME, this.hostName.getBytes());
if (loadedDate != null) this.mem.put(LOADED_DATE, Long.toString(loadedDate.getTime()).getBytes());
if (modDate != null) this.mem.put(MOD_DATE, Long.toString(modDate.getTime()).getBytes());
if (eTag != null) this.mem.put(ETAG, eTag.getBytes());
if (sitemap != null) this.mem.put(SITEMAP, sitemap.getBytes());
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis).getBytes());
this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
if (loadedDate != null) this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(loadedDate.getTime())));
if (modDate != null) this.mem.put(MOD_DATE, UTF8.getBytes(Long.toString(modDate.getTime())));
if (eTag != null) this.mem.put(ETAG, UTF8.getBytes(eTag));
if (sitemap != null) this.mem.put(SITEMAP, UTF8.getBytes(sitemap));
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, UTF8.getBytes(Long.toString(crawlDelayMillis)));
if (agentName != null) this.mem.put(AGENT_NAME, UTF8.getBytes(agentName));
if (allowPathList != null && !allowPathList.isEmpty()) {
this.allowPathList.addAll(allowPathList);
@ -139,6 +144,10 @@ public class RobotsEntry {
return this.hostName;
}
/**
 * Returns the agent name held by this robots entry.
 *
 * @return the value of the {@code agentName} field; may be {@code null},
 *         since the constructors assign it either from an optional
 *         {@code AGENT_NAME} map key or directly from a caller-supplied argument
 */
public String getAgentName() {
return this.agentName;
}
public Map<String, byte[]> getMem() {
if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, this.hostName.getBytes());
return this.mem;

@ -167,7 +167,8 @@ public class RobotsTxt {
new Date(),
null,
null,
Integer.valueOf(0));
Integer.valueOf(0),
null);
} else {
robotsTxt4Host.setLoadedDate(new Date());
}
@ -197,7 +198,8 @@ public class RobotsTxt {
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
parserResult.sitemap(),
parserResult.crawlDelayMillis());
parserResult.crawlDelayMillis(),
parserResult.agentName());
}
}
}
@ -213,12 +215,13 @@ public class RobotsTxt {
final Date modDate,
final String eTag,
final String sitemap,
final long crawlDelayMillis
final long crawlDelayMillis,
final String agentName
) {
final RobotsEntry entry = new RobotsEntry(
theURL, allowPathList, denyPathList,
loadedDate, modDate,
eTag, sitemap, crawlDelayMillis);
eTag, sitemap, crawlDelayMillis, agentName);
addEntry(entry);
return entry;
}

@ -171,7 +171,7 @@ public final class HTTPLoader {
}
} else if (responseBody == null) {
// no response, reject file
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (code = " + code + ")");
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
} else if (code == 200 || code == 203) {
// the transfer is ok

@ -174,7 +174,7 @@ public final class robotsParser {
final String userAgent = line.substring(pos).trim();
isRule4AllAgents |= userAgent.equals("*");
for (String agent: this.myNames) {
if (userAgent.toLowerCase().indexOf(agent) >= 0) {
if (userAgent.toLowerCase().equals(agent)) {
this.agentName = agent;
isRule4ThisAgents = true;
break;

@ -477,7 +477,6 @@ public class HTTPClient {
setProxy(httpUriRequest.getParams());
// statistics
storeConnectionInfo(httpUriRequest);
try {
// execute the method; some asserts confirm that the request can be sent with Content-Length and is therefore not terminated by EOF
if (httpUriRequest instanceof HttpEntityEnclosingRequest) {
HttpEntityEnclosingRequest hrequest = (HttpEntityEnclosingRequest) httpUriRequest;
@ -488,13 +487,17 @@ public class HTTPClient {
assert !hrequest.expectContinue();
}
synchronized (httpClient) {
// without synchronization we get an interruptedException here very often
try {
httpResponse = httpClient.execute(httpUriRequest, httpContext);
} catch (Exception e) {
} catch (IOException e) {
ConnectionInfo.removeConnection(httpUriRequest.hashCode());
httpUriRequest.abort();
throw new IOException("Client can't execute: " + e.getMessage());
}
}
}
private void setHeaders(final HttpUriRequest httpUriRequest) {
if (headers != null) {

@ -157,16 +157,16 @@ public final class LoaderDispatcher {
// which may be successful faster because of a cache hit
}
try {
this.loaderSteering.put(url, new Semaphore(0));
Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);
check = this.loaderSteering.remove(url);
try {
Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);check = this.loaderSteering.remove(url);
if (check != null) check.release(1000);
return response;
} catch (Exception e) {
} catch (IOException e) {
// release the semaphore anyway
check = this.loaderSteering.remove(url);
if (check != null) check.release(1000);
//Log.logException(e);
throw new IOException(e);
}
}

Loading…
Cancel
Save