Fixed stored URL in web cache when redirection(s) occur.

Associate cached content with the last redirection location, instead of
the first URL of a redirection(s) chain:
 - for proper base URL processing in parsers (fixes mantis 636 -
http://mantis.tokeek.de/view.php?id=636)
 - to prevent duplicate content in the Solr index when recrawling a
redirected URL
pull/155/head
luccioman 7 years ago
parent 5e2812c060
commit 7baa99f26f

@ -257,6 +257,11 @@ public class Response {
return this.request.name(); return this.request.name();
} }
/**
* @return the requested URL that produced this response. When redirection(s)
* occurred, this is not the initial URL, but the last redirection
* target.
*/
public DigestURL url() { public DigestURL url() {
return this.request.url(); return this.request.url();
} }

@ -267,7 +267,11 @@ public final class LoaderDispatcher {
final String storeError = response.shallStoreCacheForCrawler(); final String storeError = response.shallStoreCacheForCrawler();
if (storeError == null) { if (storeError == null) {
try { try {
Cache.store(url, response.getResponseHeader(), response.getContent()); /* Important : we associate here the loaded content with the URL response.url().
* On eventual redirection(s), response.url() provides the last redirection location.
* If instead we associated content with the initial url (beginning of the redirection(s) chain),
* the parsers would then have a wrong base URL when following links with relative URLs. */
Cache.store(response.url(), response.getResponseHeader(), response.getContent());
} catch (final IOException e) { } catch (final IOException e) {
LoaderDispatcher.log.warn("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e); LoaderDispatcher.log.warn("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
} }

Loading…
Cancel
Save