If loading from the cache fails, load from wkhtmltopdf without

the cache, using the user agent string given in the crawl profile
pull/1/head
Michael Peter Christen 10 years ago
parent d5bac64421
commit e586e423aa

@ -71,17 +71,22 @@ public class Html2Image {
* @param destination
* @return
*/
public static boolean writeWkhtmltopdf(String url, String proxy, File destination) {
boolean success = writeWkhtmltopdfInternal(url, proxy, destination);
// NOTE(review): this span looks like a rendered diff without +/- markers: the signature, the
// first call and the fallback call each appear twice, presumably the removed three-argument
// variant followed by the added variant that takes userAgent. Confirm against the repository.
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, File destination) {
// First attempt: use the given proxy, send no custom user agent (null) and do not ask
// wkhtmltopdf to ignore load errors (false).
boolean success = writeWkhtmltopdfInternal(url, proxy, destination, null, false);
if (success) return true;
// The first attempt already ran without a proxy, so there is no fallback left to try.
if (proxy == null) return false;
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
return writeWkhtmltopdfInternal(url, null, destination);
// Fallback: bypass the proxy, pass the caller's user agent and let load errors be ignored.
// NOTE(review): this is the only path with a non-null userAgent; the command line built in
// writeWkhtmltopdfInternal appends "--custom-header ..." directly after the url with no
// separating space, so the title argument and the option appear to run together. Confirm and fix there.
return writeWkhtmltopdfInternal(url, null, destination, userAgent, true);
}
private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination) {
private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination, String userAgent, boolean ignoreErrors) {
final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
String commandline = wkhtmltopdf.getAbsolutePath() + " -q --title " + url + (proxy == null ? " " : " --proxy " + proxy + " ") + (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") + url + " " + destination.getAbsolutePath();
String commandline =
wkhtmltopdf.getAbsolutePath() + " -q --title " + url +
(userAgent == null ? "" : "--custom-header 'User-Agent' '" + userAgent + "' --custom-header-propagation") +
(proxy == null ? " " : " --proxy " + proxy + " ") +
(ignoreErrors ? (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") : "") +
url + " " + destination.getAbsolutePath();
try {
List<String> message;
if (!usexvfb) {

@ -70,14 +70,14 @@ public class Snapshots {
* @param proxy - a string of the form 'http://<host>:<port>'
* @return
*/
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy) {
// NOTE(review): this span looks like a rendered diff without +/- markers: the signature and the
// writeWkhtmltopdf call each appear twice, presumably the removed variant followed by the added
// variant that carries userAgent. Confirm against the repository.
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy, String userAgent) {
// Files returned by findPaths(url, depth) are deleted only when replaceOld is set;
// the boolean result of each delete() is not checked.
Collection<File> oldPaths = findPaths(url, depth);
if (replaceOld) {
for (File oldPath: oldPaths) oldPath.delete();
}
// The target file comes from definePath with the "pdf" argument; its parent directories are
// created before rendering (the result of mkdirs() is not checked).
File path = definePath(url, "pdf", depth, date);
path.getParentFile().mkdirs();
boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path);
// Render the normalized URL into the target file, forwarding proxy and user agent unchanged.
boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
// Returns the written file on success, or null when writeWkhtmltopdf reported failure.
return success ? path : null;
}

@ -217,7 +217,7 @@ public final class LoaderDispatcher {
String ext = MultiProtocolURL.getFileExtension(file).toLowerCase();
boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext);
if (depthok && extok) {
File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, agent.userAgent);
log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));
} else {
//if (!depthok) log.warn("SNAPSHOT: depth not ok, " + (crawlProfile == null ? "profile = null" : "entry.depth() = " + request.depth() + ", profile.snapshotMaxdepth() = " + crawlProfile.snapshotMaxdepth()));

Loading…
Cancel
Save