|
|
@ -278,7 +278,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
);
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private static void load(
|
|
|
|
private static plasmaHTCache.Entry load(
|
|
|
|
URL url,
|
|
|
|
URL url,
|
|
|
|
String name,
|
|
|
|
String name,
|
|
|
|
String referer,
|
|
|
|
String referer,
|
|
|
@ -292,17 +292,17 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
int crawlingRetryCount,
|
|
|
|
int crawlingRetryCount,
|
|
|
|
boolean useContentEncodingGzip
|
|
|
|
boolean useContentEncodingGzip
|
|
|
|
) throws IOException {
|
|
|
|
) throws IOException {
|
|
|
|
if (url == null) return;
|
|
|
|
if (url == null) return null;
|
|
|
|
|
|
|
|
|
|
|
|
// if the recrawling limit was exceeded we stop crawling now
|
|
|
|
// if the recrawling limit was exceeded we stop crawling now
|
|
|
|
if (crawlingRetryCount <= 0) return;
|
|
|
|
if (crawlingRetryCount <= 0) return null;
|
|
|
|
|
|
|
|
|
|
|
|
// getting a reference to the plasmaSwitchboard
|
|
|
|
// getting a reference to the plasmaSwitchboard
|
|
|
|
plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
|
|
|
|
plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
|
|
|
|
|
|
|
|
|
|
|
|
Date requestDate = new Date(); // remember the time...
|
|
|
|
Date requestDate = new Date(); // remember the time...
|
|
|
|
String host = url.getHost();
|
|
|
|
String host = url.getHost();
|
|
|
|
String path = url.getPath();
|
|
|
|
String path = url.getFile();
|
|
|
|
int port = url.getPort();
|
|
|
|
int port = url.getPort();
|
|
|
|
boolean ssl = url.getProtocol().equals("https");
|
|
|
|
boolean ssl = url.getProtocol().equals("https");
|
|
|
|
if (port < 0) port = (ssl) ? 443 : 80;
|
|
|
|
if (port < 0) port = (ssl) ? 443 : 80;
|
|
|
@ -321,7 +321,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
new bitfield(plasmaURL.urlFlagLength),
|
|
|
|
new bitfield(plasmaURL.urlFlagLength),
|
|
|
|
true
|
|
|
|
true
|
|
|
|
);
|
|
|
|
);
|
|
|
|
return;
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// TODO: resolve yacy and yacyh domains
|
|
|
|
// TODO: resolve yacy and yacyh domains
|
|
|
@ -333,6 +333,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
|
|
|
|
|
|
|
|
// take a file from the net
|
|
|
|
// take a file from the net
|
|
|
|
httpc remote = null;
|
|
|
|
httpc remote = null;
|
|
|
|
|
|
|
|
plasmaHTCache.Entry htCache = null;
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
// create a request header
|
|
|
|
// create a request header
|
|
|
|
httpHeader requestHeader = new httpHeader();
|
|
|
|
httpHeader requestHeader = new httpHeader();
|
|
|
@ -362,14 +363,14 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
//long contentLength = res.responseHeader.contentLength();
|
|
|
|
//long contentLength = res.responseHeader.contentLength();
|
|
|
|
|
|
|
|
|
|
|
|
// reserve cache entry
|
|
|
|
// reserve cache entry
|
|
|
|
plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile);
|
|
|
|
htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile);
|
|
|
|
if (!htCache.cacheFile.getCanonicalPath().startsWith(cacheManager.cachePath.getCanonicalPath())) {
|
|
|
|
if (!htCache.cacheFile.getCanonicalPath().startsWith(cacheManager.cachePath.getCanonicalPath())) {
|
|
|
|
// if the response has not the right file type then reject file
|
|
|
|
// if the response has not the right file type then reject file
|
|
|
|
remote.close();
|
|
|
|
remote.close();
|
|
|
|
log.logInfo("REJECTED URL " + url.toString() + " because of an invalid file path ('" +
|
|
|
|
log.logInfo("REJECTED URL " + url.toString() + " because of an invalid file path ('" +
|
|
|
|
htCache.cacheFile.getCanonicalPath() + "' does not start with '" +
|
|
|
|
htCache.cacheFile.getCanonicalPath() + "' does not start with '" +
|
|
|
|
cacheManager.cachePath.getAbsolutePath() + "').");
|
|
|
|
cacheManager.cachePath.getAbsolutePath() + "').");
|
|
|
|
return;
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// request has been placed and result has been returned. work off response
|
|
|
|
// request has been placed and result has been returned. work off response
|
|
|
@ -391,15 +392,16 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
} finally {
|
|
|
|
} finally {
|
|
|
|
if (fos!=null)try{fos.close();}catch(Exception e){}
|
|
|
|
if (fos!=null)try{fos.close();}catch(Exception e){}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// enQueue new entry with response header
|
|
|
|
|
|
|
|
if (profile != null) {
|
|
|
|
|
|
|
|
cacheManager.push(htCache);
|
|
|
|
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
// if the response has not the right file type then reject file
|
|
|
|
// if the response has not the right file type then reject file
|
|
|
|
remote.close();
|
|
|
|
remote.close();
|
|
|
|
log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.responseHeader.mime() + " for URL " + url.toString());
|
|
|
|
log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.responseHeader.mime() + " for URL " + url.toString());
|
|
|
|
return;
|
|
|
|
htCache = null;
|
|
|
|
}
|
|
|
|
|
|
|
|
// enQueue new entry with response header
|
|
|
|
|
|
|
|
if (profile != null) {
|
|
|
|
|
|
|
|
cacheManager.push(htCache);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} catch (SocketException e) {
|
|
|
|
} catch (SocketException e) {
|
|
|
|
// this may happen if the client suddenly closes its connection
|
|
|
|
// this may happen if the client suddenly closes its connection
|
|
|
@ -409,6 +411,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
// and most possible corrupted
|
|
|
|
// and most possible corrupted
|
|
|
|
if (cacheFile.exists()) cacheFile.delete();
|
|
|
|
if (cacheFile.exists()) cacheFile.delete();
|
|
|
|
log.logSevere("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString());
|
|
|
|
log.logSevere("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString());
|
|
|
|
|
|
|
|
htCache = null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (res.status.startsWith("30")) {
|
|
|
|
} else if (res.status.startsWith("30")) {
|
|
|
|
if (crawlingRetryCount > 0) {
|
|
|
|
if (crawlingRetryCount > 0) {
|
|
|
@ -419,7 +422,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
|
|
|
|
|
|
|
|
if (redirectionUrlString.length() == 0) {
|
|
|
|
if (redirectionUrlString.length() == 0) {
|
|
|
|
log.logWarning("CRAWLER Redirection of URL=" + url.toString() + " aborted. Location header is empty.");
|
|
|
|
log.logWarning("CRAWLER Redirection of URL=" + url.toString() + " aborted. Location header is empty.");
|
|
|
|
return;
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// normalizing URL
|
|
|
|
// normalizing URL
|
|
|
@ -439,7 +442,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
// if we are already doing a shutdown we don't need to retry crawling
|
|
|
|
// if we are already doing a shutdown we don't need to retry crawling
|
|
|
|
if (Thread.currentThread().isInterrupted()) {
|
|
|
|
if (Thread.currentThread().isInterrupted()) {
|
|
|
|
log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown.");
|
|
|
|
log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown.");
|
|
|
|
return;
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// generating url hash
|
|
|
|
// generating url hash
|
|
|
@ -449,7 +452,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash);
|
|
|
|
plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash);
|
|
|
|
|
|
|
|
|
|
|
|
// retry crawling with new url
|
|
|
|
// retry crawling with new url
|
|
|
|
load(redirectionUrl,
|
|
|
|
plasmaHTCache.Entry redirectedEntry = load(redirectionUrl,
|
|
|
|
name,
|
|
|
|
name,
|
|
|
|
referer,
|
|
|
|
referer,
|
|
|
|
initiator,
|
|
|
|
initiator,
|
|
|
@ -462,6 +465,22 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
--crawlingRetryCount,
|
|
|
|
--crawlingRetryCount,
|
|
|
|
useContentEncodingGzip
|
|
|
|
useContentEncodingGzip
|
|
|
|
);
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (redirectedEntry != null) {
|
|
|
|
|
|
|
|
// TODO: Here we can store the content of the redirection
|
|
|
|
|
|
|
|
// as content of the original URL if some criterias are met
|
|
|
|
|
|
|
|
// See: http://www.yacy-forum.de/viewtopic.php?t=1719
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
|
|
// plasmaHTCache.Entry newEntry = (plasmaHTCache.Entry) redirectedEntry.clone();
|
|
|
|
|
|
|
|
// newEntry.url = url;
|
|
|
|
|
|
|
|
// TODO: which http header should we store here?
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
|
|
// // enQueue new entry with response header
|
|
|
|
|
|
|
|
// if (profile != null) {
|
|
|
|
|
|
|
|
// cacheManager.push(newEntry);
|
|
|
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
// htCache = newEntry;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
log.logInfo("Redirection counter exceeded for URL " + url.toString() + ". Processing aborted.");
|
|
|
|
log.logInfo("Redirection counter exceeded for URL " + url.toString() + ". Processing aborted.");
|
|
|
@ -471,7 +490,9 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for URL " + url.toString());
|
|
|
|
log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for URL " + url.toString());
|
|
|
|
// not processed any further
|
|
|
|
// not processed any further
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (remote != null) remote.close();
|
|
|
|
if (remote != null) remote.close();
|
|
|
|
|
|
|
|
return htCache;
|
|
|
|
} catch (Exception e) {
|
|
|
|
} catch (Exception e) {
|
|
|
|
boolean retryCrawling = false;
|
|
|
|
boolean retryCrawling = false;
|
|
|
|
String errorMsg = e.getMessage();
|
|
|
|
String errorMsg = e.getMessage();
|
|
|
@ -522,7 +543,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
// if we are already doing a shutdown we don't need to retry crawling
|
|
|
|
// if we are already doing a shutdown we don't need to retry crawling
|
|
|
|
if (Thread.currentThread().isInterrupted()) {
|
|
|
|
if (Thread.currentThread().isInterrupted()) {
|
|
|
|
log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown.");
|
|
|
|
log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown.");
|
|
|
|
return;
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// returning the used httpc
|
|
|
|
// returning the used httpc
|
|
|
@ -533,7 +554,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
if (crawlingRetryCount > 2) crawlingRetryCount = 2;
|
|
|
|
if (crawlingRetryCount > 2) crawlingRetryCount = 2;
|
|
|
|
|
|
|
|
|
|
|
|
// retry crawling
|
|
|
|
// retry crawling
|
|
|
|
load(url,
|
|
|
|
return load(url,
|
|
|
|
name,
|
|
|
|
name,
|
|
|
|
referer,
|
|
|
|
referer,
|
|
|
|
initiator,
|
|
|
|
initiator,
|
|
|
@ -547,6 +568,7 @@ public final class plasmaCrawlWorker extends Thread {
|
|
|
|
false
|
|
|
|
false
|
|
|
|
);
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return null;
|
|
|
|
} finally {
|
|
|
|
} finally {
|
|
|
|
if (remote != null) httpc.returnInstance(remote);
|
|
|
|
if (remote != null) httpc.returnInstance(remote);
|
|
|
|
}
|
|
|
|
}
|
|
|
|