diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 0162297e0..5a11aa24b 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -257,6 +257,11 @@ public class Response { return this.request.name(); } + /** + * @return the requested URL that produced this response. When redirection(s) + * occurred, this is not the initial URL, but the last redirection + * target. + */ public DigestURL url() { return this.request.url(); } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 897ae9f5c..03feb047d 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -267,7 +267,11 @@ public final class LoaderDispatcher { final String storeError = response.shallStoreCacheForCrawler(); if (storeError == null) { try { - Cache.store(url, response.getResponseHeader(), response.getContent()); + /* Important: the loaded content is associated here with response.url(), not with the initially requested url. + * When redirection(s) occurred, response.url() provides the last redirection location. + * If the content were instead associated with the initial URL (beginning of the redirection chain), + * the parsers would then have a wrong base URL when resolving links with relative URLs. */ + Cache.store(response.url(), response.getResponseHeader(), response.getContent()); } catch (final IOException e) { LoaderDispatcher.log.warn("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e); }