/**
 *  Snapshots
 *  Copyright 2014 by Michael Peter Christen
 *  First released 29.11.2014 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.crawler.data;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;

import org.apache.solr.common.SolrDocument;

import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.Html2Image;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;

/**
 * This class hosts document snapshots.
 *
 * The storage is organized in the following hierarchy:
 * - in the root path are subpaths for each host:port
 * - in the host:port path are subpaths for the crawl depth, two digits in length
 * - in the crawl depth path are subpaths for the first two characters of the url-hash, called shard
 * - in the shard path are files, named with <urlhash>'.'<date>.<ext>
 * .. where the <date> has the form "yyyyMMdd" and ext may be one of {pdf,jpg,png,xml,json}.
 * The pdf is created with wkhtmltopdf, the jpg/png is created with convert
 * and the xml/json is an extract from solr.
 *
 * Including the date in the file name makes it possible to keep several snapshots of the same document
 * for different snapshot times. Using the crawl depth makes it easier to extract a specific part
 * of the domain.
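 *
 * A hypothetical example: a pdf snapshot of http://example.com/page.html, taken at crawl depth 2
 * on 2014-11-29, would be stored as
 * <root>/example.com.80/02/<xy>/<urlhash>.20141129.pdf
 * where <xy> are the first two characters of <urlhash>.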
 */
public class Snapshots {

    private final File storageLocation;

    public Snapshots(final File location) {
        this.storageLocation = location;
    }

    /**
     * Load a pdf snapshot of a document.
     * A proxy must be given to ensure that resources which are loaded several times (e.g. images) are cached.
     * Use http://localhost:<thisport> as proxy.
     * @param url the document url
     * @param depth the crawl depth of the document
     * @param date the snapshot date, used in the file name
     * @param replaceOld if true, all older snapshots of the same url and depth are deleted first
     * @param proxy a string of the form 'http://<host>:<port>'
     * @param userAgent the user agent string used for the download
     * @return the file of the pdf snapshot, or null if the pdf could not be created
     */
    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy, String userAgent) {
        Collection<File> oldPaths = findPaths(url, depth);
        if (replaceOld) {
            for (File oldPath: oldPaths) oldPath.delete();
        }
        File path = definePath(url, "pdf", depth, date);
        path.getParentFile().mkdirs();
        boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
        return success ? path : null;
    }
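
    // A hypothetical usage sketch (the storage path, proxy port and user agent below are example
    // values, not values defined by this class): create a pdf snapshot through the local proxy
    // and replace any older snapshots of the same url and depth.
    //
    //   Snapshots snapshots = new Snapshots(new File("DATA/HTCACHE/snapshots"));
    //   DigestURL url = new DigestURL("http://example.com/page.html");
    //   File pdf = snapshots.downloadPDFSnapshot(url, 2, new Date(), true, "http://localhost:8090", "yacybot");
    //   if (pdf == null) { /* wkhtmltopdf failed or is not installed */ }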

    /**
     * Compute the path of a snapshot. This does not create the snapshot, it only computes the path.
     * The parent directories of the path are not created either.
     * @param url the document url
     * @param ext the file extension of the snapshot, one of {pdf,jpg,png,xml,json}
     * @param depth the crawl depth of the document
     * @param date the snapshot date
     * @return the file where the snapshot is (or would be) stored
     */
    public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
        String id = ASCII.String(url.hash());
        String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
        File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
        return path;
    }

    /**
     * Get the crawl depth of a document; helper method for definePath to determine the depth value.
     * @param url the document url
     * @param fulltext the fulltext index which holds the crawldepth_i field
     * @return the crawl depth of the document, or 0 if it is unknown
     */
    public int getDepth(final DigestURL url, final Fulltext fulltext) {
        Integer depth = null;
        if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
            try {
                SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
                if (doc != null) {
                    depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
                }
            } catch (IOException e) {
                // the document could not be loaded; the depth stays unknown
            }
        }
        return depth == null ? 0 : depth;
    }

    /**
     * For a given url, get all paths of storage locations.
     * The locations all belong to the single url but may represent different storage times.
     * This method is inefficient because it tests all possible depths; it is better to use
     * findPaths(url, depth) with a known depth.
     * @param url the document url
     * @return a collection of files with snapshots of the url
     */
    public Collection<File> findPaths(final DigestURL url) {
        for (int i = 0; i < 100; i++) {
            Collection<File> paths = findPaths(url, i);
            if (paths.size() > 0) return paths;
        }
        return new ArrayList<>(0);
    }

    /**
     * For a given url, get all paths of storage locations.
     * The locations all belong to the single url but may represent different storage times.
     * @param url the document url
     * @param depth the crawl depth of the document
     * @return a collection of files with snapshots of the url
     */
    public Collection<File> findPaths(final DigestURL url, final int depth) {
        String id = ASCII.String(url.hash());
        File pathToShard = pathToShard(url, depth);
        String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if path does not exist
        ArrayList<File> paths = new ArrayList<>();
        if (list != null) {
            final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
            for (String f: list) {
                if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
            }
        }
        return paths;
    }

    private File pathToShard(final DigestURL url, final int depth) {
        String id = ASCII.String(url.hash());
        File pathToHostDir = new File(storageLocation, url.getHost() + "." + url.getPort());                // <host>.<port>
        File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth)); // two-digit crawl depth
        File pathToShard = new File(pathToDepthDir, id.substring(0, 2));                                   // first two characters of the url-hash
        return pathToShard;
    }

}