From 932faafffebfab27708d47afb51cf2a614a040c2 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 16 Dec 2014 12:09:57 +0100 Subject: [PATCH] reactivated on-demand snapshot loading --- htroot/api/snapshot.java | 14 ++- .../net/yacy/crawler/data/Transactions.java | 87 +++++++++++-------- source/net/yacy/search/index/Segment.java | 2 +- 3 files changed, 60 insertions(+), 43 deletions(-) diff --git a/htroot/api/snapshot.java b/htroot/api/snapshot.java index c86ab9e7b..221296c30 100644 --- a/htroot/api/snapshot.java +++ b/htroot/api/snapshot.java @@ -26,6 +26,7 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.Collection; +import java.util.Date; import java.util.Map; import java.util.TreeMap; @@ -248,16 +249,21 @@ public class snapshot { } if (pdf || pngjpg) { - Collection pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY); + Collection pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY); File pdfFile = null; if (pdfSnapshots.size() == 0) { // if the client is authenticated, we create the pdf on the fly! if (!authenticated) return null; SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash()); - SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd); - boolean success = Transactions.store(sid, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); + boolean success = false; + if (sd == null) { + success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); + } else { + SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd); + success = Transactions.store(sid, false, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); + } if (success) { - pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY); + pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY); if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next(); } } else { diff --git a/source/net/yacy/crawler/data/Transactions.java b/source/net/yacy/crawler/data/Transactions.java index 2ebcd77d4..c37fd7cb7 100644 --- a/source/net/yacy/crawler/data/Transactions.java +++ b/source/net/yacy/crawler/data/Transactions.java @@ -146,7 +146,7 @@ public class Transactions { } } - public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { + public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { // GET METADATA FROM DOC final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); @@ -160,55 +160,66 @@ public class Transactions { return false; } + boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, agent, acceptLanguage) : true; + if (success) { + // STORE METADATA FOR THE IMAGE + File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY); + metadataPath.getParentFile().mkdirs(); + try { + if (doc != null) { + FileOutputStream fos = new FileOutputStream(metadataPath); + OutputStreamWriter osw = new OutputStreamWriter(fos); + osw.write(XML_PREFIX); + osw.write(WHITESPACE); osw.write("\n-->\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata) + osw.write("\n"); + EnhancedXMLResponseWriter.writeDoc(osw, doc); + osw.write("\n"); + osw.write("\n"); + osw.close(); + fos.close(); + Transactions.announceStorage(url, depth, date, State.INVENTORY); + } + } catch (IOException e) { + ConcurrentLog.logException(e); + success = false; + } + } + + return success; + } + + + public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { + // CLEAN UP OLD DATA (if wanted) Collection oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY); - if (replaceOld) { + if (replaceOld && oldPaths != null) { for (File oldPath: oldPaths) oldPath.delete(); } - // STORE METADATA FOR THE IMAGE File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY); metadataPath.getParentFile().mkdirs(); boolean success = true; - try { - if (doc != null) { - FileOutputStream fos = new FileOutputStream(metadataPath); - OutputStreamWriter osw = new OutputStreamWriter(fos); - osw.write(XML_PREFIX); - osw.write(WHITESPACE); osw.write("\n-->\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata) - osw.write("\n"); - EnhancedXMLResponseWriter.writeDoc(osw, doc); - osw.write("\n"); - osw.write("\n"); - osw.close(); - fos.close(); - Transactions.announceStorage(url, depth, date, State.INVENTORY); - } - } catch (IOException e) { - ConcurrentLog.logException(e); - success = false; - } // STORE AN IMAGE - if (success && loadImage) { - final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY); - if (executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) { - Thread t = new Thread(){ - @Override - public void run() { - executorRunning.incrementAndGet(); - try { - Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); - } catch (Throwable e) {} finally { - executorRunning.decrementAndGet(); - } + final String urls = url.toNormalform(true); + final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY); + if (concurrency && executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) { + Thread t = new Thread(){ + @Override + public void run() { + executorRunning.incrementAndGet(); + try { + Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); + } catch (Throwable e) {} finally { + executorRunning.decrementAndGet(); } - }; - executor.execute(t); - } else { - success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); - } + } + }; + executor.execute(t); + } else { + success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); } return success; diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 7e8ff154f..b6d3ea2ca 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -580,7 +580,7 @@ public class Segment { String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(); if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) { // STORE IMAGE AND METADATA - Transactions.store(vector, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage); + Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage); } }