From 7baa99f26f752ccdab32f03bdecf7645ec564ab6 Mon Sep 17 00:00:00 2001 From: luccioman Date: Sat, 20 Jan 2018 18:54:08 +0100 Subject: [PATCH] Fixed stored URL in web cache when redirection(s) occur. Associate cached content with the last redirection location, instead of the first URL of a redirection(s) chain: - for proper base URL processing in parsers (fixes mantis 636 - http://mantis.tokeek.de/view.php?id=636) - to prevent duplicated content in Solr index when recrawling a redirected URL --- source/net/yacy/crawler/retrieval/Response.java | 5 +++++ source/net/yacy/repository/LoaderDispatcher.java | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 0162297e0..5a11aa24b 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -257,6 +257,11 @@ public class Response { return this.request.name(); } + /** + * @return the requested URL that produced this response. When redirection(s) + * occurred, this is not the initial URL, but the last redirection + * target. + */ public DigestURL url() { return this.request.url(); } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 897ae9f5c..03feb047d 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -267,7 +267,11 @@ public final class LoaderDispatcher { final String storeError = response.shallStoreCacheForCrawler(); if (storeError == null) { try { - Cache.store(url, response.getResponseHeader(), response.getContent()); + /* Important : we associate here the loaded content with the URL response.url(). + * On eventual redirection(s), response.url() provides the last redirection location. 
+ * If instead we associated content with the initial url (beginning of the redirection(s) chain), + * the parsers would then have a wrong base URL when following links with relative URLs. */ + Cache.store(response.url(), response.getResponseHeader(), response.getContent()); } catch (final IOException e) { LoaderDispatcher.log.warn("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e); }