From e586e423aad17bdc9af494964defb6ef71017df7 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Tue, 2 Dec 2014 13:35:19 +0100
Subject: [PATCH] in case that loading from the cache fails, load from
 wkhtmltopdf without cache using the user agent string given in the crawl
 profile

---
Review notes:
- Html2Image.writeWkhtmltopdfInternal: added the missing space in front of
  "--custom-header". Without it the option was concatenated directly onto the
  --title URL whenever a user agent was passed, so the header option never
  reached wkhtmltopdf as a separate argument.
- The first attempt (through the proxy) passes no user agent and does not
  ignore load errors; only the fallback without proxy sends the crawl
  profile's user agent and ignores load errors.
- NOTE(review): the single quotes around the header name and value only act
  as quotes if the command string is interpreted by a shell. The code that
  executes the command line is outside this hunk - confirm that user agent
  strings containing spaces are passed as one argument.

 source/net/yacy/cora/util/Html2Image.java        | 15 ++++++++++-----
 source/net/yacy/crawler/data/Snapshots.java      |  4 ++--
 source/net/yacy/repository/LoaderDispatcher.java |  2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java
index 3821f6b42..e845a8a0f 100644
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@@ -71,17 +71,22 @@ public class Html2Image {
      * @param destination
      * @return
      */
-    public static boolean writeWkhtmltopdf(String url, String proxy, File destination) {
-        boolean success = writeWkhtmltopdfInternal(url, proxy, destination);
+    public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, File destination) {
+        boolean success = writeWkhtmltopdfInternal(url, proxy, destination, null, false);
         if (success) return true;
         if (proxy == null) return false;
         ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
-        return writeWkhtmltopdfInternal(url, null, destination);
+        return writeWkhtmltopdfInternal(url, null, destination, userAgent, true);
     }
 
-    private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination) {
+    private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination, String userAgent, boolean ignoreErrors) {
         final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
-        String commandline = wkhtmltopdf.getAbsolutePath() + " -q --title " + url + (proxy == null ? " " : " --proxy " + proxy + " ") + (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") + url + " " + destination.getAbsolutePath();
+        String commandline =
+                wkhtmltopdf.getAbsolutePath() + " -q --title " + url +
+                (userAgent == null ? "" : " --custom-header 'User-Agent' '" + userAgent + "' --custom-header-propagation") +
+                (proxy == null ? " " : " --proxy " + proxy + " ") +
+                (ignoreErrors ? (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") : "") +
+                url + " " + destination.getAbsolutePath();
         try {
             List<String> message;
             if (!usexvfb) {
diff --git a/source/net/yacy/crawler/data/Snapshots.java b/source/net/yacy/crawler/data/Snapshots.java
index fb9fe3f34..394d14f73 100644
--- a/source/net/yacy/crawler/data/Snapshots.java
+++ b/source/net/yacy/crawler/data/Snapshots.java
@@ -70,14 +70,14 @@ public class Snapshots {
      * @param proxy - a string of the form 'http://<host>:<port>
      * @return
      */
-    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy) {
+    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy, String userAgent) {
         Collection<File> oldPaths = findPaths(url, depth);
         if (replaceOld) {
             for (File oldPath: oldPaths) oldPath.delete();
         }
         File path = definePath(url, "pdf", depth, date);
         path.getParentFile().mkdirs();
-        boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path);
+        boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
         return success ? path : null;
     }
 
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index bdfad4ed2..566eca773 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -217,7 +217,7 @@ public final class LoaderDispatcher {
                 String ext = MultiProtocolURL.getFileExtension(file).toLowerCase();
                 boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext);
                 if (depthok && extok) {
-                    File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
+                    File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, agent.userAgent);
                     log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));
                 } else {
                     //if (!depthok) log.warn("SNAPSHOT: depth not ok, " + (crawlProfile == null ? "profile = null" : "entry.depth() = " + request.depth() + ", profile.snapshotMaxdepth() = " + crawlProfile.snapshotMaxdepth()));