Fixed the footer font size and print page generation in PDF snapshots

pull/1/head
Michael Peter Christen 10 years ago
parent 24f68a4eb7
commit 7db2888336

@ -258,10 +258,10 @@ public class snapshot {
SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash()); SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash());
boolean success = false; boolean success = false;
if (sd == null) { if (sd == null) {
success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, sb.getConfig("crawler.http.acceptLanguage", null));
} else { } else {
SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd); SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd);
success = Transactions.store(sid, false, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); success = Transactions.store(sid, false, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, sb.getConfig("crawler.http.acceptLanguage", null));
} }
if (success) { if (success) {
pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY); pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);

@ -73,14 +73,15 @@ public class ClientIdentification {
public static Agent yacyIntranetCrawlerAgent = null; // defined later in static public static Agent yacyIntranetCrawlerAgent = null; // defined later in static
public final static String googleAgentName = "Googlebot"; public final static String googleAgentName = "Googlebot";
public final static Agent googleAgentAgent = new Agent("Googlebot/2.1 (+http://www.google.com/bot.html)", new String[]{"Googlebot", "Googlebot-Mobile"}, minimumGlobalDeltaInit / 2, clientTimeoutInit); public final static Agent googleAgentAgent = new Agent("Googlebot/2.1 (+http://www.google.com/bot.html)", new String[]{"Googlebot", "Googlebot-Mobile"}, minimumGlobalDeltaInit / 2, clientTimeoutInit);
public final static String browserAgentName = "Random Browser";
public final static Agent browserAgent = new Agent(browserAgents[random.nextInt(browserAgents.length)], new String[]{"Mozilla"}, minimumLocalDeltaInit, clientTimeoutInit);
public final static String yacyProxyAgentName = "YaCyProxy"; public final static String yacyProxyAgentName = "YaCyProxy";
public final static Agent yacyProxyAgent = new Agent("yacy - this is a proxy access through YaCy from a browser, not a robot (the yacy bot user agent is 'yacybot')", new String[]{"yacy"}, minimumGlobalDeltaInit, clientTimeoutInit); public final static Agent yacyProxyAgent = new Agent("yacy - this is a proxy access through YaCy from a browser, not a robot (the yacy bot user agent is 'yacybot')", new String[]{"yacy"}, minimumGlobalDeltaInit, clientTimeoutInit);
public final static String customAgentName = "Custom Agent"; public final static String customAgentName = "Custom Agent";
public final static String browserAgentName = "Random Browser";
public static Agent browserAgent;
static { static {
generateYaCyBot("new"); generateYaCyBot("new");
browserAgent = new Agent(browserAgents[random.nextInt(browserAgents.length)], new String[]{"Mozilla"}, minimumLocalDeltaInit, clientTimeoutInit);
agents.put(googleAgentName, googleAgentAgent); agents.put(googleAgentName, googleAgentAgent);
agents.put(browserAgentName, browserAgent); agents.put(browserAgentName, browserAgent);
agents.put(yacyProxyAgentName, yacyProxyAgent); agents.put(yacyProxyAgentName, yacyProxyAgent);

@ -52,7 +52,7 @@ public class Html2Image {
// to install wkhtmltopdf, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from http://wkhtmltopdf.org/downloads.html // to install wkhtmltopdf, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from http://wkhtmltopdf.org/downloads.html
// to install imagemagick, download from http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip // to install imagemagick, download from http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip
// the convert command from imagemagick needs ghostscript, if not present on older macs, download a version of gs from http://pages.uoregon.edu/koch/ // the convert command from imagemagick needs ghostscript, if not present on older macs, download a version of gs from http://pages.uoregon.edu/koch/
private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf"); private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf"); // sometimes this is also the path on debian
private final static File convertMac1 = new File("/opt/local/bin/convert"); private final static File convertMac1 = new File("/opt/local/bin/convert");
private final static File convertMac2 = new File("/opt/ImageMagick/bin/convert"); private final static File convertMac2 = new File("/opt/ImageMagick/bin/convert");
@ -81,7 +81,7 @@ public class Html2Image {
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, File destination) { public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, File destination) {
boolean success = false; boolean success = false;
for (boolean ignoreErrors: new boolean[]{false, true}) { for (boolean ignoreErrors: new boolean[]{false, true}) {
success = writeWkhtmltopdfInternal(url, proxy, destination, null, acceptLanguage, ignoreErrors); success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors);
if (success) break; if (success) break;
if (!success && proxy != null) { if (!success && proxy != null) {
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url); ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
@ -106,7 +106,7 @@ public class Html2Image {
(proxy == null ? "" : "--proxy " + proxy + " ") + (proxy == null ? "" : "--proxy " + proxy + " ") +
(ignoreErrors ? (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") : "") + // some versions do not have that flag and fail if attempting to use it... (ignoreErrors ? (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") : "") + // some versions do not have that flag and fail if attempting to use it...
//"--footer-font-name 'Courier' --footer-font-size 9 --footer-left [webpage] --footer-right [date]/[time]([page]/[topage]) " + //"--footer-font-name 'Courier' --footer-font-size 9 --footer-left [webpage] --footer-right [date]/[time]([page]/[topage]) " +
"--footer-left [webpage] --footer-right '[date]/[time]([page]/[topage])' " + "--footer-left [webpage] --footer-right '[date]/[time]([page]/[topage])' --footer-font-size 7 " +
url + " " + destination.getAbsolutePath(); url + " " + destination.getAbsolutePath();
try { try {
ConcurrentLog.info("Html2Pdf", "creating pdf from url " + url + " with command: " + commandline); ConcurrentLog.info("Html2Pdf", "creating pdf from url " + url + " with command: " + commandline);

@ -146,7 +146,7 @@ public class Transactions {
} }
} }
public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final String acceptLanguage) {
// GET METADATA FROM DOC // GET METADATA FROM DOC
final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@ -160,7 +160,7 @@ public class Transactions {
return false; return false;
} }
boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, agent, acceptLanguage) : true; boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, acceptLanguage) : true;
if (success) { if (success) {
// STORE METADATA FOR THE IMAGE // STORE METADATA FOR THE IMAGE
File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY); File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
@ -189,7 +189,7 @@ public class Transactions {
} }
public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final String acceptLanguage) {
// CLEAN UP OLD DATA (if wanted) // CLEAN UP OLD DATA (if wanted)
Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY); Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY);
@ -211,7 +211,7 @@ public class Transactions {
public void run() { public void run() {
executorRunning.incrementAndGet(); executorRunning.incrementAndGet();
try { try {
Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
} catch (Throwable e) {} finally { } catch (Throwable e) {} finally {
executorRunning.decrementAndGet(); executorRunning.decrementAndGet();
} }
@ -219,7 +219,7 @@ public class Transactions {
}; };
executor.execute(t); executor.execute(t);
} else { } else {
success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
} }
return success; return success;

@ -580,7 +580,7 @@ public class Segment {
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(); String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) { if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
// STORE IMAGE AND METADATA // STORE IMAGE AND METADATA
Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage); Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, acceptLanguage);
} }
} }

Loading…
Cancel
Save