From cd279907c0d3d5eee02c4a5e91ba134ad9e4b5f4 Mon Sep 17 00:00:00 2001 From: theli Date: Tue, 28 Jun 2005 08:01:26 +0000 Subject: [PATCH] *) Adding redirection support to plasmaCrawlWorker.java git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@327 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/http/httpHeader.java | 1 + .../de/anomic/plasma/plasmaCrawlWorker.java | 61 ++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/source/de/anomic/http/httpHeader.java b/source/de/anomic/http/httpHeader.java index 8db02a03b..f1c57a1d9 100644 --- a/source/de/anomic/http/httpHeader.java +++ b/source/de/anomic/http/httpHeader.java @@ -105,6 +105,7 @@ public final class httpHeader extends TreeMap implements Map { public static final String RANGE = "Range"; public static final String CACHE_CONTROL = "Cache-Control"; public static final String TRANSFER_ENCODING = "Transfer-Encoding"; + public static final String LOCATION = "Location"; public static final String X_CACHE = "X-Cache"; public static final String X_CACHE_LOOKUP = "X-Cache-Lookup"; diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index 5b8f46d93..c0c7a167c 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -232,6 +232,35 @@ public final class plasmaCrawlWorker extends Thread { plasmaHTCache cacheManager, serverLog log ) throws IOException { + load(url, + referer, + initiator, + depth, + profile, + socketTimeout, + remoteProxyHost, + remoteProxyPort, + remoteProxyUse, + cacheManager, + log, + 0 + ); + } + + public static void load( + URL url, + String referer, + String initiator, + int depth, + plasmaCrawlProfile.entry profile, + int socketTimeout, + String remoteProxyHost, + int remoteProxyPort, + boolean remoteProxyUse, + plasmaHTCache cacheManager, + serverLog log, + int redirectionCount + ) throws IOException { if (url == null) return; Date requestDate = new Date(); // remember the time... String host = url.getHost(); @@ -316,7 +345,37 @@ public final class plasmaCrawlWorker extends Thread { if (cacheFile.exists()) cacheFile.delete(); log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString()); } - } else { + } else if (res.status.startsWith("30")) { + if (redirectionCount < 5) { + if (res.responseHeader.containsKey(httpHeader.LOCATION)) { + // generating the new url + URL redirectionUrl = new URL(url, (String) res.responseHeader.get(httpHeader.LOCATION)); + + // returning the used httpc + httpc.returnInstance(remote); + remote = null; + + // restart crawling with new url + log.logInfo("Redirection detected ('" + res.status + "') for url " + url.toString() + + "\nRedirecting request to: " + redirectionUrl); + load(redirectionUrl, + referer, + initiator, + depth, + profile, + socketTimeout, + remoteProxyHost, + remoteProxyPort, + remoteProxyUse, + cacheManager, + log, + ++redirectionCount + ); + } + } else { + log.logInfo("Redirection counter exceeded for url " + url.toString() + ". Processing aborted."); + } + }else { // if the response has not the right response type then reject file log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString()); // not processed any further