YaCy can now create web page snapshots as pdf documents, which can later
be transcoded into jpg images for previews. To create such pdfs, do the
following:

Install wkhtmltopdf and imagemagick on your OS:
On a Mac, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from
http://wkhtmltopdf.org/downloads.html and download
http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip
On Debian, run "apt-get install wkhtmltopdf imagemagick"
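
The crawl start page only offers the snapshot option when both external
tools can be found. A minimal standalone sketch of that availability probe,
in the spirit of the static checks added to Html2Image below (only the
Debian convert path appears in the diff; the other binary locations are
assumptions and may differ on your system):

import java.io.File;

// Sketch of the availability probe, mirroring Html2Image.wkhtmltopdfAvailable()
// and Html2Image.convertAvailable(). Only /usr/bin/convert is taken from the
// diff below; the other binary locations are assumptions.
public class SnapshotToolCheck {
    private static final File WKHTMLTOPDF_MAC    = new File("/usr/local/bin/wkhtmltopdf"); // assumed
    private static final File WKHTMLTOPDF_DEBIAN = new File("/usr/bin/wkhtmltopdf");       // assumed
    private static final File CONVERT_MAC        = new File("/opt/local/bin/convert");     // assumed
    private static final File CONVERT_DEBIAN     = new File("/usr/bin/convert");           // from the diff

    public static boolean wkhtmltopdfAvailable() {
        return WKHTMLTOPDF_MAC.exists() || WKHTMLTOPDF_DEBIAN.exists();
    }

    public static boolean convertAvailable() {
        return CONVERT_MAC.exists() || CONVERT_DEBIAN.exists();
    }

    public static void main(String[] args) {
        System.out.println("wkhtmltopdf available: " + wkhtmltopdfAvailable());
        System.out.println("convert available:     " + convertAvailable());
    }
}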

Then, in /Settings_p.html?page=ProxyAccess, check "Transparent Proxy" and
"Always Fresh". These settings let wkhtmltopdf fetch web pages through the
YaCy proxy, and with "Always Fresh" enabled all pages can be served directly
from the proxy cache.
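
With both settings enabled, the crawler points wkhtmltopdf at the local YaCy
proxy so every page it renders comes out of the proxy cache. A minimal sketch
of that call; ProcessBuilder stands in for YaCy's OS.execSynchronous helper,
and the binary path, port and output file are assumptions:

import java.io.File;
import java.io.IOException;

// Sketch: render one URL to pdf through the local YaCy proxy.
// ProcessBuilder replaces YaCy's OS.execSynchronous; paths and port are assumptions.
public class ProxySnapshotSketch {

    public static boolean writePdf(String url, String proxy, File destination)
            throws IOException, InterruptedException {
        ProcessBuilder pb = proxy == null
                ? new ProcessBuilder("/usr/bin/wkhtmltopdf", "--title", url, url, destination.getAbsolutePath())
                : new ProcessBuilder("/usr/bin/wkhtmltopdf", "--title", url, "--proxy", proxy, url, destination.getAbsolutePath());
        pb.inheritIO();
        pb.start().waitFor();              // wait until wkhtmltopdf has written the pdf
        return destination.exists();       // same success criterion as Html2Image below
    }

    public static void main(String[] args) throws Exception {
        // 8090 is the YaCy default port; the crawler reads the configured port at runtime.
        writePdf("http://yacy.net/", "http://127.0.0.1:8090", new File("/tmp/snapshot.pdf"));
    }
}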

Finally, a new option appears when starting an expert web crawl: the maximum
crawl depth up to which pdf snapshots are generated. The resulting pdfs are
then available in
DATA/HTCACHE/SNAPSHOTS/<host>.<port>/<depth>/<shard>/<urlhash>.<date>.pdf
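
The directory layout follows the pathToShard() logic in Snapshots: a
<host>.<port> directory, a two-digit depth directory, and a shard named after
the first two characters of the url hash. A rough sketch of how such a path
is assembled; the date format and class name here are assumptions (YaCy
formats dates with its GenericFormatter):

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;

// Rough sketch of the SNAPSHOTS directory layout described above.
// The "yyyyMMdd" date pattern is an assumption; YaCy uses GenericFormatter.
public class SnapshotPathSketch {

    public static File pdfPath(File snapshotsRoot, String host, int port,
                               int depth, String urlhash, Date date) {
        File hostDir  = new File(snapshotsRoot, host + "." + port);                            // <host>.<port>
        File depthDir = new File(hostDir, depth < 10 ? "0" + depth : Integer.toString(depth)); // <depth>, zero-padded
        File shardDir = new File(depthDir, urlhash.substring(0, 2));                           // <shard>
        String day    = new SimpleDateFormat("yyyyMMdd").format(date);                         // <date>, assumed format
        return new File(shardDir, urlhash + "." + day + ".pdf");                               // <urlhash>.<date>.pdf
    }

    public static void main(String[] args) {
        File root = new File("DATA/HTCACHE/SNAPSHOTS");
        System.out.println(pdfPath(root, "yacy.net", 80, 1, "abcdefghijkl", new Date()));
    }
}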
Michael Peter Christen 10 years ago
parent 41d00350e4
commit 97f6089a41

@@ -460,6 +460,28 @@
</dl>
</fieldset>
#(/agentSelect)#
#(snapshotSelect)#<input type="hidden" name="snapshotsMaxDepth" id="snapshotsMaxDepth" value="-1" />::
<fieldset>
<legend>Snapshot Creation</legend>
<dl>
<dt><label for="snapshot">Max Depth for Snapshots</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
Snapshots are pictures of web pages that can be created during crawl time. They are first stored as pdf into subdirectories
of HTCACHE/SNAPSHOTS/ and later converted to jpg from those pdfs. Snapshot generation is controlled by a depth parameter: a
snapshot is only generated if the crawl depth of a document is smaller than or equal to the number given here. If the number is set to -1,
no snapshots are generated.
</span></span>
<input type="text" name="snapshotsMaxDepth" id="snapshotsMaxDepth" size="2" maxlength="2" value="-1" />
</dd>
<dt><label for="snapshot">Multiple Snapshot Versions</label></dt>
<dd>
<input type="radio" name="snapshotsReplaceOld" value="on" checked="checked"/> replace old snapshots with new one&nbsp;&nbsp;&nbsp;
<input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
</dd>
</dl>
</fieldset>
#(/snapshotSelect)#
<fieldset>
<legend>Index Administration</legend>
<dl>

@@ -29,6 +29,7 @@ import java.util.List;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
@@ -511,6 +512,15 @@ public class CrawlStartExpert {
ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Snapshot generation
if (sb.getConfigBool("isTransparentProxy", false) &&
sb.getConfigBool("proxyAlwaysFresh", false) &&
Html2Image.wkhtmltopdfAvailable() && Html2Image.convertAvailable()) {
prop.put("snapshotSelect", 1);
} else {
prop.put("snapshotSelect", 0);
}
// ---------- Index Administration
// Do Local Indexing
if (post == null) {
@@ -548,7 +558,7 @@ public class CrawlStartExpert {
prop.put("collection", collectionEnabled ? defaultCollection : "");
}
}
// return rewrite properties
return prop;
}

@@ -436,6 +436,11 @@ public class Crawler_p {
// check crawlurl was given in sitecrawl
if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;
}
String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1");
int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
// prepare a new crawling profile
final CrawlProfile profile;
byte[] handle;
@@ -462,7 +467,8 @@ public class Crawler_p {
indexMedia,
storeHTCache,
crawlOrder,
-1, // temporary; stub commit
snapshotsMaxDepth,
snapshotsReplaceOld,
cachePolicy,
collection,
agentName);

@@ -152,7 +152,7 @@ public class QuickCrawlLink_p {
obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
indexText, indexMedia,
storeHTCache, remoteIndexing,
-1,
-1, true,
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);

@@ -37,12 +37,8 @@ import java.awt.Graphics;
import java.awt.image.BufferedImage;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
public class Html2Image {
@@ -58,11 +54,11 @@ public class Html2Image {
private final static File convertDebian = new File("/usr/bin/convert");
public boolean wkhtmltopdfAvailable() {
public static boolean wkhtmltopdfAvailable() {
return wkhtmltopdfMac.exists() || wkhtmltopdfDebian.exists();
}
public boolean convertAvailable() {
public static boolean convertAvailable() {
return convertMac.exists() || convertDebian.exists();
}
@@ -77,7 +73,7 @@ public class Html2Image {
final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
try {
OS.execSynchronous(wkhtmltopdf.getAbsolutePath() + (proxy == null ? " " : " --proxy " + proxy + " ") + url + " " + destination.getAbsolutePath());
OS.execSynchronous(wkhtmltopdf.getAbsolutePath() + " --title " + url + (proxy == null ? " " : " --proxy " + proxy + " ") + url + " " + destination.getAbsolutePath());
return destination.exists();
} catch (IOException e) {
e.printStackTrace();

@@ -293,7 +293,7 @@ public final class CrawlSwitchboard {
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-1,
-1, true,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName);
@@ -323,7 +323,7 @@
true,
false,
false,
-1,
-1, true,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName);
@@ -353,7 +353,7 @@
false,
true,
false,
-1,
-1, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -383,7 +383,7 @@
true,
true,
false,
-1,
-1, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -414,7 +414,7 @@
false,
true,
false,
-1,
-1, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName);
@@ -444,7 +444,7 @@
false,
true,
false,
-1,
-1, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -474,7 +474,7 @@
true,
true,
false,
-1,
-1, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -504,7 +504,7 @@
false,
false,
false,
-1,
-1, true,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -537,7 +537,7 @@
true,
false,
false,
-1,
-1, true,
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);

@@ -86,7 +86,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
public static final String LOADPREVIEWMAXDEPTH = "loadpreviewmaxdepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise multiple versions per day are kept
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
@@ -142,7 +143,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean indexMedia,
final boolean storeHTCache,
final boolean remoteIndexing,
final int loadPreviewMaxdepth,
final int snapshotsMaxDepth,
final boolean snapshotsReplaceOld,
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName) {
@@ -178,7 +180,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(REMOTE_INDEXING, remoteIndexing);
put(LOADPREVIEWMAXDEPTH, loadPreviewMaxdepth);
put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
}
@@ -575,8 +578,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString()));
}
public int loadPreviewMaxdepth() {
final String r = get(LOADPREVIEWMAXDEPTH);
public int snapshotMaxdepth() {
final String r = get(SNAPSHOTS_MAXDEPTH);
if (r == null) return -1;
try {
final int i = Integer.parseInt(r);
@@ -588,6 +591,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public boolean snapshotReplaceold() {
final String r = get(SNAPSHOTS_REPLACEOLD);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* get a recrawl date for a given age in minutes
* @param oldTimeMinutes

@@ -31,6 +31,8 @@ import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.Html2Image;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;
@@ -68,13 +70,15 @@ public class Snapshots {
* @param proxy - a string of the form 'http://<host>:<port>'
* @return
*/
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, String proxy) {
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy) {
Collection<File> oldPaths = findPaths(url, depth);
if (replaceOld) {
for (File oldPath: oldPaths) oldPath.delete();
}
File path = definePath(url, "pdf", depth, date);
path.getParentFile().mkdirs();
// STUB
return path;
boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path);
return success ? path : null;
}
/**
@@ -122,9 +126,9 @@
* @param ext
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext) {
public Collection<File> findPaths(final DigestURL url) {
for (int i = 0; i < 100; i++) {
Collection<File> paths = findPaths(url, ext, i);
Collection<File> paths = findPaths(url, i);
if (paths.size() > 0) return paths;
}
return new ArrayList<>(0);
@@ -138,20 +142,23 @@
* @param depth
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext, final int depth) {
public Collection<File> findPaths(final DigestURL url, final int depth) {
String id = ASCII.String(url.hash());
File pathToShard = pathToShard(url, depth);
String[] list = pathToShard.list();
String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if path does not exist
ArrayList<File> paths = new ArrayList<>();
for (String f: list) {
if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
if (list != null) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
for (String f: list) {
if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
}
}
return paths;
}
private File pathToShard(final DigestURL url, final int depth) {
String id = ASCII.String(url.hash());
File pathToHostDir = new File(storageLocation, url.getHost() + ":" + url.getPort());
File pathToHostDir = new File(storageLocation, url.getHost() + "." + url.getPort());
File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
return pathToShard;

@@ -24,10 +24,12 @@
package net.yacy.crawler.retrieval;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
@@ -76,8 +78,11 @@ public final class HTTPLoader {
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
if (entry.depth() <= profile.loadPreviewMaxdepth() && "html|shtml|php".indexOf(entry.url().getFile()) >= 0) {
sb.snapshots.downloadPDFSnapshot(entry.url(), entry.depth(), new Date(), "http://127.0.0.1:" + sb.getConfigInt("port", 8090));
boolean depthok = profile != null && entry.depth() <= profile.snapshotMaxdepth();
boolean extok = entry.url().getFile().length() == 0 || "html|shtml|php".indexOf(MultiProtocolURL.getFileExtension(entry.url().getFile())) >= 0;
if (depthok && extok) {
File snapshotFile = sb.snapshots.downloadPDFSnapshot(entry.url(), entry.depth(), new Date(), profile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
this.log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + entry.url().toNormalform(true) : "wrote " + snapshotFile + " for " + entry.url().toNormalform(true)));
}
return doc;
}

@@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
crawlingQ,
true, true, true, false,
true, true, false,
-1,
-1, true,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard

@@ -70,7 +70,7 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler
final net.yacy.crawler.retrieval.Request yacyRequest = new net.yacy.crawler.retrieval.Request(
null,
url,
proxyHeaders.referer() == null ? null : new DigestURL(proxyHeaders.referer().toString()).hash(),
proxyHeaders.referer() == null ? null : new DigestURL(proxyHeaders.referer().toNormalform(true)).hash(),
"",
cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),

@@ -346,7 +346,6 @@ public final class Switchboard extends serverSwitch {
this.htDocsPath =
getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT);
this.log.config("HTDOCS Path: " + this.htDocsPath.toString());
this.snapshots = new Snapshots(new File(this.htDocsPath, "SNAPSHOTS"));
this.workPath = getDataPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT);
this.workPath.mkdirs();
// if default work files exist, copy them (don't overwrite existing!)
@@ -695,7 +694,8 @@ public final class Switchboard extends serverSwitch {
final long maxCacheSize =
1024L * 1024L * Long.parseLong(getConfig(SwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte
Cache.init(this.htCachePath, this.peers.mySeed().hash, maxCacheSize);
this.snapshots = new Snapshots(new File(this.htCachePath, "SNAPSHOTS"));
// create the surrogates directories
this.surrogatesInPath =
getDataPath(
