diff --git a/htroot/api/snapshot.java b/htroot/api/snapshot.java index 3c5d84737..b75465a25 100644 --- a/htroot/api/snapshot.java +++ b/htroot/api/snapshot.java @@ -258,10 +258,10 @@ public class snapshot { SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash()); boolean success = false; if (sd == null) { - success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); + success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, sb.getConfig("crawler.http.acceptLanguage", null)); } else { SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd); - success = Transactions.store(sid, false, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); + success = Transactions.store(sid, false, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, sb.getConfig("crawler.http.acceptLanguage", null)); } if (success) { pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY); diff --git a/source/net/yacy/cora/protocol/ClientIdentification.java b/source/net/yacy/cora/protocol/ClientIdentification.java index e6b4a54f4..4c64f315c 100644 --- a/source/net/yacy/cora/protocol/ClientIdentification.java +++ b/source/net/yacy/cora/protocol/ClientIdentification.java @@ -73,14 +73,15 @@ public class ClientIdentification { public static Agent yacyIntranetCrawlerAgent = null; // defined later in static public final static String googleAgentName = "Googlebot"; public final static Agent googleAgentAgent = new Agent("Googlebot/2.1 (+http://www.google.com/bot.html)", new String[]{"Googlebot", "Googlebot-Mobile"}, minimumGlobalDeltaInit / 2, clientTimeoutInit); - public final static String browserAgentName = "Random Browser"; - public final static Agent browserAgent = new Agent(browserAgents[random.nextInt(browserAgents.length)], new String[]{"Mozilla"}, minimumLocalDeltaInit, clientTimeoutInit); public final static String yacyProxyAgentName = "YaCyProxy"; public final static Agent yacyProxyAgent = new Agent("yacy - this is a proxy access through YaCy from a browser, not a robot (the yacy bot user agent is 'yacybot')", new String[]{"yacy"}, minimumGlobalDeltaInit, clientTimeoutInit); public final static String customAgentName = "Custom Agent"; - + public final static String browserAgentName = "Random Browser"; + public static Agent browserAgent; + static { generateYaCyBot("new"); + browserAgent = new Agent(browserAgents[random.nextInt(browserAgents.length)], new String[]{"Mozilla"}, minimumLocalDeltaInit, clientTimeoutInit); agents.put(googleAgentName, googleAgentAgent); agents.put(browserAgentName, browserAgent); agents.put(yacyProxyAgentName, yacyProxyAgent); diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java index e3d2235d1..387ccd2f5 100644 --- a/source/net/yacy/cora/util/Html2Image.java +++ b/source/net/yacy/cora/util/Html2Image.java @@ -52,7 +52,7 @@ public class Html2Image { // to install wkhtmltopdf, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from http://wkhtmltopdf.org/downloads.html // to install imagemagick, download from http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip // the convert command from imagemagick needs ghostscript, if not present on older macs, download a version of gs from http://pages.uoregon.edu/koch/ - private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf"); + private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf"); // sometimes this is also the path on debian private final static File convertMac1 = new File("/opt/local/bin/convert"); private final static File convertMac2 = new File("/opt/ImageMagick/bin/convert"); @@ -81,7 +81,7 @@ public class Html2Image { public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, File destination) { boolean success = false; for (boolean ignoreErrors: new boolean[]{false, true}) { - success = writeWkhtmltopdfInternal(url, proxy, destination, null, acceptLanguage, ignoreErrors); + success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors); if (success) break; if (!success && proxy != null) { ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url); @@ -106,7 +106,7 @@ public class Html2Image { (proxy == null ? "" : "--proxy " + proxy + " ") + (ignoreErrors ? (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") : "") + // some versions do not have that flag and fail if attempting to use it... //"--footer-font-name 'Courier' --footer-font-size 9 --footer-left [webpage] --footer-right [date]/[time]([page]/[topage]) " + - "--footer-left [webpage] --footer-right '[date]/[time]([page]/[topage])' " + + "--footer-left [webpage] --footer-right '[date]/[time]([page]/[topage])' --footer-font-size 7 " + url + " " + destination.getAbsolutePath(); try { ConcurrentLog.info("Html2Pdf", "creating pdf from url " + url + " with command: " + commandline); diff --git a/source/net/yacy/crawler/data/Transactions.java b/source/net/yacy/crawler/data/Transactions.java index c37fd7cb7..11d4180a7 100644 --- a/source/net/yacy/crawler/data/Transactions.java +++ b/source/net/yacy/crawler/data/Transactions.java @@ -146,7 +146,7 @@ public class Transactions { } } - public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { + public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final String acceptLanguage) { // GET METADATA FROM DOC final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); @@ -160,7 +160,7 @@ public class Transactions { return false; } - boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, agent, acceptLanguage) : true; + boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, acceptLanguage) : true; if (success) { // STORE METADATA FOR THE IMAGE File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY); @@ -189,7 +189,7 @@ public class Transactions { } - public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { + public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final String acceptLanguage) { // CLEAN UP OLD DATA (if wanted) Collection oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY); @@ -211,7 +211,7 @@ public class Transactions { public void run() { executorRunning.incrementAndGet(); try { - Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); + Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath); } catch (Throwable e) {} finally { executorRunning.decrementAndGet(); } @@ -219,7 +219,7 @@ public class Transactions { }; executor.execute(t); } else { - success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); + success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath); } return success; diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 295ca338d..de2bdd757 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -580,7 +580,7 @@ public class Segment { String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(); if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) { // STORE IMAGE AND METADATA - Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage); + Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, acceptLanguage); } }