*) Bugfix for crawling URLs with query parameters

See: http://www.yacy-forum.de/viewtopic.php?p=14065
*) Preparation for http://www.yacy-forum.de/viewtopic.php?t=1719

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1405 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 0fcc07d3f7
commit ecdc1f7547

@ -278,7 +278,7 @@ public final class plasmaCrawlWorker extends Thread {
); );
} }
private static void load( private static plasmaHTCache.Entry load(
URL url, URL url,
String name, String name,
String referer, String referer,
@ -292,17 +292,17 @@ public final class plasmaCrawlWorker extends Thread {
int crawlingRetryCount, int crawlingRetryCount,
boolean useContentEncodingGzip boolean useContentEncodingGzip
) throws IOException { ) throws IOException {
if (url == null) return; if (url == null) return null;
// if the recrawling limit was exceeded we stop crawling now // if the recrawling limit was exceeded we stop crawling now
if (crawlingRetryCount <= 0) return; if (crawlingRetryCount <= 0) return null;
// getting a reference to the plasmaSwitchboard // getting a reference to the plasmaSwitchboard
plasmaSwitchboard sb = plasmaCrawlLoader.switchboard; plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
Date requestDate = new Date(); // remember the time... Date requestDate = new Date(); // remember the time...
String host = url.getHost(); String host = url.getHost();
String path = url.getPath(); String path = url.getFile();
int port = url.getPort(); int port = url.getPort();
boolean ssl = url.getProtocol().equals("https"); boolean ssl = url.getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443 : 80; if (port < 0) port = (ssl) ? 443 : 80;
@ -321,7 +321,7 @@ public final class plasmaCrawlWorker extends Thread {
new bitfield(plasmaURL.urlFlagLength), new bitfield(plasmaURL.urlFlagLength),
true true
); );
return; return null;
} }
// TODO: resolve yacy and yacyh domains // TODO: resolve yacy and yacyh domains
@ -333,6 +333,7 @@ public final class plasmaCrawlWorker extends Thread {
// take a file from the net // take a file from the net
httpc remote = null; httpc remote = null;
plasmaHTCache.Entry htCache = null;
try { try {
// create a request header // create a request header
httpHeader requestHeader = new httpHeader(); httpHeader requestHeader = new httpHeader();
@ -362,14 +363,14 @@ public final class plasmaCrawlWorker extends Thread {
//long contentLength = res.responseHeader.contentLength(); //long contentLength = res.responseHeader.contentLength();
// reserve cache entry // reserve cache entry
plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile); htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile);
if (!htCache.cacheFile.getCanonicalPath().startsWith(cacheManager.cachePath.getCanonicalPath())) { if (!htCache.cacheFile.getCanonicalPath().startsWith(cacheManager.cachePath.getCanonicalPath())) {
// if the response has not the right file type then reject file // if the response has not the right file type then reject file
remote.close(); remote.close();
log.logInfo("REJECTED URL " + url.toString() + " because of an invalid file path ('" + log.logInfo("REJECTED URL " + url.toString() + " because of an invalid file path ('" +
htCache.cacheFile.getCanonicalPath() + "' does not start with '" + htCache.cacheFile.getCanonicalPath() + "' does not start with '" +
cacheManager.cachePath.getAbsolutePath() + "')."); cacheManager.cachePath.getAbsolutePath() + "').");
return; return null;
} }
// request has been placed and result has been returned. work off response // request has been placed and result has been returned. work off response
@ -391,15 +392,16 @@ public final class plasmaCrawlWorker extends Thread {
} finally { } finally {
if (fos!=null)try{fos.close();}catch(Exception e){} if (fos!=null)try{fos.close();}catch(Exception e){}
} }
// enQueue new entry with response header
if (profile != null) {
cacheManager.push(htCache);
}
} else { } else {
// if the response has not the right file type then reject file // if the response has not the right file type then reject file
remote.close(); remote.close();
log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.responseHeader.mime() + " for URL " + url.toString()); log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.responseHeader.mime() + " for URL " + url.toString());
return; htCache = null;
}
// enQueue new entry with response header
if (profile != null) {
cacheManager.push(htCache);
} }
} catch (SocketException e) { } catch (SocketException e) {
// this may happen if the client suddenly closes its connection // this may happen if the client suddenly closes its connection
@ -409,6 +411,7 @@ public final class plasmaCrawlWorker extends Thread {
// and most possible corrupted // and most possible corrupted
if (cacheFile.exists()) cacheFile.delete(); if (cacheFile.exists()) cacheFile.delete();
log.logSevere("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString()); log.logSevere("CRAWLER LOADER ERROR1: with URL=" + url.toString() + ": " + e.toString());
htCache = null;
} }
} else if (res.status.startsWith("30")) { } else if (res.status.startsWith("30")) {
if (crawlingRetryCount > 0) { if (crawlingRetryCount > 0) {
@ -419,7 +422,7 @@ public final class plasmaCrawlWorker extends Thread {
if (redirectionUrlString.length() == 0) { if (redirectionUrlString.length() == 0) {
log.logWarning("CRAWLER Redirection of URL=" + url.toString() + " aborted. Location header is empty."); log.logWarning("CRAWLER Redirection of URL=" + url.toString() + " aborted. Location header is empty.");
return; return null;
} }
// normalizing URL // normalizing URL
@ -439,7 +442,7 @@ public final class plasmaCrawlWorker extends Thread {
// if we are already doing a shutdown we don't need to retry crawling // if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) { if (Thread.currentThread().isInterrupted()) {
log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown."); log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown.");
return; return null;
} }
// generating url hash // generating url hash
@ -449,7 +452,7 @@ public final class plasmaCrawlWorker extends Thread {
plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash); plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash);
// retry crawling with new url // retry crawling with new url
load(redirectionUrl, plasmaHTCache.Entry redirectedEntry = load(redirectionUrl,
name, name,
referer, referer,
initiator, initiator,
@ -462,6 +465,22 @@ public final class plasmaCrawlWorker extends Thread {
--crawlingRetryCount, --crawlingRetryCount,
useContentEncodingGzip useContentEncodingGzip
); );
if (redirectedEntry != null) {
// TODO: Here we can store the content of the redirection
// as content of the original URL if some criteria are met
// See: http://www.yacy-forum.de/viewtopic.php?t=1719
//
// plasmaHTCache.Entry newEntry = (plasmaHTCache.Entry) redirectedEntry.clone();
// newEntry.url = url;
// TODO: which http header should we store here?
//
// // enQueue new entry with response header
// if (profile != null) {
// cacheManager.push(newEntry);
// }
// htCache = newEntry;
}
} }
} else { } else {
log.logInfo("Redirection counter exceeded for URL " + url.toString() + ". Processing aborted."); log.logInfo("Redirection counter exceeded for URL " + url.toString() + ". Processing aborted.");
@ -471,7 +490,9 @@ public final class plasmaCrawlWorker extends Thread {
log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for URL " + url.toString()); log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for URL " + url.toString());
// not processed any further // not processed any further
} }
if (remote != null) remote.close(); if (remote != null) remote.close();
return htCache;
} catch (Exception e) { } catch (Exception e) {
boolean retryCrawling = false; boolean retryCrawling = false;
String errorMsg = e.getMessage(); String errorMsg = e.getMessage();
@ -522,7 +543,7 @@ public final class plasmaCrawlWorker extends Thread {
// if we are already doing a shutdown we don't need to retry crawling // if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) { if (Thread.currentThread().isInterrupted()) {
log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown."); log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown.");
return; return null;
} }
// returning the used httpc // returning the used httpc
@ -533,7 +554,7 @@ public final class plasmaCrawlWorker extends Thread {
if (crawlingRetryCount > 2) crawlingRetryCount = 2; if (crawlingRetryCount > 2) crawlingRetryCount = 2;
// retry crawling // retry crawling
load(url, return load(url,
name, name,
referer, referer,
initiator, initiator,
@ -547,6 +568,7 @@ public final class plasmaCrawlWorker extends Thread {
false false
); );
} }
return null;
} finally { } finally {
if (remote != null) httpc.returnInstance(remote); if (remote != null) httpc.returnInstance(remote);
} }

@ -675,6 +675,20 @@ public final class plasmaHTCache {
public plasmaCrawlProfile.entry profile; public plasmaCrawlProfile.entry profile;
private String initiator; private String initiator;
/**
 * Creates a copy of this entry by invoking the Entry constructor with this
 * entry's initDate, depth, url, name, requestHeader, responseStatus,
 * responseHeader, initiator and profile.
 *
 * The copy is built through the constructor and not via super.clone();
 * the nine values listed above are handed to the constructor unchanged.
 * NOTE(review): any state that is not one of these constructor arguments
 * is presumably set up by the constructor itself -- confirm against the
 * constructor body.
 *
 * @return a new Entry constructed from this entry's values
 * @throws CloneNotSupportedException declared by the signature; it is not
 *         thrown explicitly by this method
 */
protected Object clone() throws CloneNotSupportedException {
    final Entry copy = new Entry(
        this.initDate, this.depth, this.url, this.name,
        this.requestHeader, this.responseStatus, this.responseHeader,
        this.initiator, this.profile
    );
    return copy;
}
public Entry(Date initDate, int depth, URL url, String name, public Entry(Date initDate, int depth, URL url, String name,
httpHeader requestHeader, httpHeader requestHeader,
String responseStatus, httpHeader responseHeader, String responseStatus, httpHeader responseHeader,

Loading…
Cancel
Save