From 08ea0b03977faebc5504afb7427068a3ab83a5f7 Mon Sep 17 00:00:00 2001 From: luccioman Date: Tue, 11 Dec 2018 22:31:31 +0100 Subject: [PATCH] Added a configurable timeout to wkhtmltopdf calls for pdf snapshots Necessary to prevent blocking the indexing workflow when some wkhtmltopdf renderings fail without terminating --- defaults/yacy.init | 4 + source/net/yacy/cora/util/Html2Image.java | 108 +++++++++++++----- .../net/yacy/crawler/data/Transactions.java | 14 ++- source/net/yacy/kelondro/util/OS.java | 18 ++- source/net/yacy/search/Switchboard.java | 3 +- .../net/yacy/search/SwitchboardConstants.java | 10 +- 6 files changed, 121 insertions(+), 36 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index d2175eb35..92367510b 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -858,6 +858,10 @@ crawler.latencyFactor = 0.5 # defined here crawler.onDemandLimit = 1000 +# The maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots +# Beyond that limit the process is killed +snapshots.wkhtmltopdf.timeout = 30 + # maximum size of indexing queue indexer.slots = 100 diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java index d9adeb5bb..fa4bd17cf 100644 --- a/source/net/yacy/cora/util/Html2Image.java +++ b/source/net/yacy/cora/util/Html2Image.java @@ -193,21 +193,30 @@ public class Html2Image { return available; } - /** - * write a pdf of a web page - * @param url - * @param proxy must be of the form http://host:port; use YaCy here as proxy which is mostly http://localhost:8090 - * @param destination - * @return - */ - public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, File destination) { + /** + * Run the wkhtmltopdf external tool to fetch and render to PDF a web resource. + * wkhtmltopdf may be called multiple times with various parameter combinations in + * case of failure. 
+ * + * @param url the URL of a web resource to fetch, render and convert to + * a pdf file. Must not be null. + * @param proxy the eventual proxy address to use. Can be null. Must be of + * the form http://host:port; use YaCy here as proxy which is + * mostly http://localhost:8090 + * @param destination the destination PDF file that should be written. Must not + * be null. + * @param maxSeconds the maximum time in seconds to wait for each wkhtmltopdf + * call termination. Beyond this limit the process is killed. + * @return true when the destination file was successfully written + */ + public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) { boolean success = false; for (boolean ignoreErrors: new boolean[]{false, true}) { - success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors); + success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds); if (success) break; if (!success && proxy != null) { ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url); - success = writeWkhtmltopdfInternal(url, null, destination, userAgent, acceptLanguage, ignoreErrors); + success = writeWkhtmltopdfInternal(url, null, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds); if (success) break; } } @@ -219,7 +228,23 @@ public class Html2Image { return success; } - private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination, final String userAgent, final String acceptLanguage, final boolean ignoreErrors) { + /** + * Run wkhtmltopdf in a separate process to fetch and render to PDF a web + * resource. + * + * @param url the URL of a web resource to fetch, render and convert to + * a pdf file. Must not be null. + * @param proxy the eventual proxy address to use. Can be null. 
+ * @param destination the destination PDF file that should be written. Must not + * be null. + * @param ignoreErrors when true wkhtmltopdf is instructed to ignore load errors + * @param maxSeconds the maximum time in seconds to wait for the wkhtmltopdf + * dedicated process termination. Beyond this limit the + * process is killed. + * @return true when the destination file was successfully written + */ + private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination, + final String userAgent, final String acceptLanguage, final boolean ignoreErrors, final long maxSeconds) { final String wkhtmltopdfCmd; final File wkhtmltopdf = wkhtmltopdfExecutable(); if(wkhtmltopdf != null) { @@ -241,26 +266,57 @@ public class Html2Image { url + " " + destination.getAbsolutePath(); try { ConcurrentLog.info("Html2Pdf", "creating pdf from url " + url + " with command: " + commandline); - List message; - if (!usexvfb) { - message = OS.execSynchronous(commandline); - if (destination.exists()) return true; - ConcurrentLog.warn("Html2Image", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " with command: " + commandline); - for (String m: message) ConcurrentLog.warn("Html2Image", ">> " + m); + if (!usexvfb && execWkhtmlToPdf(proxy, destination, commandline, maxSeconds)) { + return true; } // if this fails, we should try to wrap the X server with a virtual screen using xvfb, this works on headless servers commandline = "xvfb-run -a " + commandline; - message = OS.execSynchronous(commandline); - if (destination.exists()) {usexvfb = true; return true;} - ConcurrentLog.warn("Html2Pdf", "failed to create pdf " + (proxy == null ? 
"" : "using proxy " + proxy) + " and xvfb with command: " + commandline); - for (String m: message) ConcurrentLog.warn("Html2Image", ">> " + m); - return false; - } catch (IOException e) { - e.printStackTrace(); - ConcurrentLog.warn("Html2Pdf", "exception while creation of pdf with command: " + commandline); + return execWkhtmlToPdf(proxy, destination, commandline, maxSeconds); + } catch (final IOException e) { + ConcurrentLog.warn("Html2Pdf", "exception while creation of pdf with command: " + commandline, e); return false; } } + + /** + * Run a wkhtmltopdf commandline in a separate process. + * + * @param proxy the eventual proxy address to use. Can be null. + * @param destination the destination PDF file that should be written. Must not + * be null. + * @param commandline the wkhtmltopdf command line to execute. Must not be null. + * @param maxSeconds the maximum time in seconds to wait for the process + * termination. Beyond this limit the process is killed. + * @return true when the destination file was successfully written + * @throws IOException when an unexpected error occurred + */ + private static boolean execWkhtmlToPdf(final String proxy, final File destination, final String commandline, final long maxSeconds) + throws IOException { + final Process p = Runtime.getRuntime().exec(commandline); + + try { + p.waitFor(maxSeconds, TimeUnit.SECONDS); + } catch (final InterruptedException e) { + p.destroyForcibly(); + ConcurrentLog.warn("Html2Pdf", "Interrupted creation of pdf. Killing the process started with command : " + commandline); + Thread.currentThread().interrupt(); // Keep the thread interrupted state + return false; + } + if(p.isAlive()) { + ConcurrentLog.warn("Html2Pdf", "Creation of pdf did not terminate within " + maxSeconds + " seconds. 
Killing the process started with command : " + commandline); + p.destroyForcibly(); + return false; + } + if (p.exitValue() == 0 && destination.exists()) { + return true; + } + final List messages = OS.readStreams(p); + ConcurrentLog.warn("Html2Image", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " with command : " + commandline); + for (final String message : messages) { + ConcurrentLog.warn("Html2Image", ">> " + message); + } + return false; + } /** * Convert a pdf (first page) to an image. Proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75 @@ -459,7 +515,7 @@ public class Html2Image { return; } if(Html2Image.writeWkhtmltopdf(args[0], null, ClientIdentification.yacyInternetCrawlerAgent.userAgent, - "en-us,en;q=0.5", targetPdfFile)) { + "en-us,en;q=0.5", targetPdfFile, 30)) { if(targetPath.endsWith(".jpg") || targetPath.endsWith(".png")) { if(Html2Image.pdf2image(targetPdfFile, new File(targetPath), 1024, 1024, 300, 75)) { ConcurrentLog.info("Html2Image", "wrote " + targetPath + " converted from " + targetPdfFile); diff --git a/source/net/yacy/crawler/data/Transactions.java b/source/net/yacy/crawler/data/Transactions.java index ae2c6043d..c12eeb548 100644 --- a/source/net/yacy/crawler/data/Transactions.java +++ b/source/net/yacy/crawler/data/Transactions.java @@ -65,6 +65,9 @@ public class Transactions { private static ExecutorService executor = Executors.newCachedThreadPool(); private static AtomicInteger executorRunning = new AtomicInteger(0); + /** the maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots */ + private static long wkhtmltopdfTimeout = 30; + static { for (int i = 0; i < WHITESPACE.length; i++) WHITESPACE[i] = 32; } @@ -77,13 +80,18 @@ public class Transactions { } } - public static void init(File dir) { + /** + * @param dir the parent directory of inventory and archive snapshots. 
+ * @param wkhtmltopdfSecondsTimeout the maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots + */ + public static void init(final File dir, final long wkhtmltopdfSecondsTimeout) { transactionDir = dir; transactionDir.mkdirs(); inventoryDir = new File(transactionDir, State.INVENTORY.dirname); inventory = new Snapshots(inventoryDir); archiveDir = new File(transactionDir, State.ARCHIVE.dirname); archive = new Snapshots(archiveDir); + wkhtmltopdfTimeout = wkhtmltopdfSecondsTimeout; } public static synchronized void migrateIPV6Snapshots() { @@ -228,7 +236,7 @@ public class Transactions { public void run() { executorRunning.incrementAndGet(); try { - Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath); + Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath, wkhtmltopdfTimeout); } catch (Throwable e) {} finally { executorRunning.decrementAndGet(); } @@ -236,7 +244,7 @@ public class Transactions { }; executor.execute(t); } else { - success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath); + success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath, wkhtmltopdfTimeout); } return success; diff --git a/source/net/yacy/kelondro/util/OS.java b/source/net/yacy/kelondro/util/OS.java index f4fa0dd68..beca081a1 100644 --- a/source/net/yacy/kelondro/util/OS.java +++ b/source/net/yacy/kelondro/util/OS.java @@ -166,19 +166,29 @@ public final class OS { // runs a unix/linux command and returns output as Vector of Strings // this method blocks until the command is executed final Process p = Runtime.getRuntime().exec(command); - return execSynchronousProcess(p); + return readStreams(p); } public static List execSynchronous(final String[] command) throws IOException { // runs a unix/linux command and returns output as Vector of Strings // 
this method blocks until the command is executed final Process p = Runtime.getRuntime().exec(command); - return execSynchronousProcess(p); + return readStreams(p); } - private static List execSynchronousProcess(Process p) throws IOException { + /** + * Read all lines from both standard and error output from the given process + * @param p a process + * @return all the lines from the process standard and error output + * @throws IOException when an unexpected error occurred + */ + public static List readStreams(final Process p) throws IOException { String line; - final List output = new ArrayList(); + final List output = new ArrayList<>(); + + if(p == null) { + return output; + } try (final InputStreamReader streamReader = new InputStreamReader(p.getInputStream()); final BufferedReader in = new BufferedReader(streamReader);) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index bbe96ee7c..a268ff775 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -770,7 +770,8 @@ public final class Switchboard extends serverSwitch { getConfigInt(SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL, SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL_DEFAULT)); final File transactiondir = new File(this.htCachePath, "snapshots"); - Transactions.init(transactiondir); + Transactions.init(transactiondir, getConfigLong(SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT, + SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT_DEFAULT)); // create the surrogates directories this.surrogatesInPath = diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index b1145057e..b745edda9 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -329,10 +329,10 @@ public final class SwitchboardConstants { /** Default value controlling whether a self-signed certificate is acceptable from a remote 
Solr instance with authentication credentials. */ public static final boolean FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED_DEFAULT = false; - /** Key of the setting controlling wheter to use or not an embedded Solr instance */ + /** Key of the setting controlling whether to use or not an embedded Solr instance */ public static final String CORE_SERVICE_FULLTEXT = "core.service.fulltext"; - /** Default setting value controlling wheter to use or not an embedded Solr instance */ + /** Default setting value controlling whether to use or not an embedded Solr instance */ public static final boolean CORE_SERVICE_FULLTEXT_DEFAULT = true; public static final String CORE_SERVICE_RWI = "core.service.rwi.tmp"; @@ -354,6 +354,12 @@ public final class SwitchboardConstants { public static final String CRAWLER_USER_AGENT_MINIMUMDELTA = "crawler.userAgent.minimumdelta"; public static final String CRAWLER_USER_AGENT_CLIENTTIMEOUT = "crawler.userAgent.clienttimeout"; + /** Key of the setting controlling the maximum time to wait for each wkhtmltopdf call when rendering PDF snapshots */ + public static final String SNAPSHOTS_WKHTMLTOPDF_TIMEOUT = "snapshots.wkhtmltopdf.timeout"; + + /** Default maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots*/ + public static final long SNAPSHOTS_WKHTMLTOPDF_TIMEOUT_DEFAULT = 30; + /* --- debug flags --- */ /** when set to true : do not use the local dht/rwi index (which is not done if we do remote searches) */