diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 1f333286f..5fc6e2d4d 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -462,6 +462,7 @@ public class Crawler_p {
                 indexMedia,
                 storeHTCache,
                 crawlOrder,
+                -1, // temporary; stub commit
                 cachePolicy,
                 collection,
                 agentName);
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 756b0a4b9..dfad2b5a2 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -152,6 +152,7 @@ public class QuickCrawlLink_p {
                 obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 indexText, indexMedia,
                 storeHTCache, remoteIndexing,
+                -1,
                 CacheStrategy.IFFRESH,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index ce70b58e5..7fb2af71e 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -293,6 +293,7 @@ public final class CrawlSwitchboard {
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
                 true,
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
+                -1,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_PROXY,
                 ClientIdentification.yacyProxyAgentName);
@@ -322,6 +323,7 @@ public final class CrawlSwitchboard {
                 true,
                 false,
                 false,
+                -1,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_REMOTE,
                 ClientIdentification.yacyInternetCrawlerAgentName);
@@ -351,6 +353,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
+                -1,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -380,6 +383,7 @@ public final class CrawlSwitchboard {
                 true,
                 true,
                 false,
+                -1,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -410,6 +414,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
+                -1,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
                 ClientIdentification.browserAgentName);
@@ -439,6 +444,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
+                -1,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -468,6 +474,7 @@ public final class CrawlSwitchboard {
                 true,
                 true,
                 false,
+                -1,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -497,6 +504,7 @@ public final class CrawlSwitchboard {
                 false,
                 false,
                 false,
+                -1,
                 CacheStrategy.NOCACHE,
                 "robot_" + CRAWL_PROFILE_SURROGATE,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -529,6 +537,7 @@ public final class CrawlSwitchboard {
                 true,
                 false,
                 false,
+                -1,
                 CacheStrategy.NOCACHE,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 909427d57..c44cd6fea 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -86,6 +86,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
     public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
     public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
+    public static final String LOADPREVIEWMAXDEPTH = "loadpreviewmaxdepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
@@ -141,6 +142,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                  final boolean indexMedia,
                  final boolean storeHTCache,
                  final boolean remoteIndexing,
+                 final int loadPreviewMaxdepth,
                  final CacheStrategy cacheStrategy,
                  final String collections,
                  final String userAgentName) {
@@ -176,6 +178,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(INDEX_MEDIA, indexMedia);
         put(STORE_HTCACHE, storeHTCache);
         put(REMOTE_INDEXING, remoteIndexing);
+        put(LOADPREVIEWMAXDEPTH, loadPreviewMaxdepth);
         put(CACHE_STRAGEGY, cacheStrategy.toString());
         put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
     }
@@ -565,11 +568,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
+
     public boolean remoteIndexing() {
         final String r = get(REMOTE_INDEXING);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
+
+    public int loadPreviewMaxdepth() {
+        final String r = get(LOADPREVIEWMAXDEPTH);
+        if (r == null) return -1;
+        try {
+            final int i = Integer.parseInt(r);
+            if (i < 0) return -1;
+            return i;
+        } catch (final NumberFormatException e) {
+            ConcurrentLog.logException(e);
+            return -1;
+        }
+    }

     /**
      * get a recrawl date for a given age in minutes
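The getter above is deliberately defensive: a CrawlProfile is a plain string map, so the depth entry can be missing, negative, or unparseable, and all three cases collapse to -1, meaning "no previews". A minimal stand-alone sketch of that contract, using an ordinary HashMap in place of the real CrawlProfile (class and method names here are illustrative, not YaCy API):

```java
import java.util.HashMap;
import java.util.Map;

public class PreviewDepthDemo {

    // mirrors the parse-with-fallback pattern of CrawlProfile.loadPreviewMaxdepth()
    static int loadPreviewMaxdepth(final Map<String, String> profile) {
        final String r = profile.get("loadpreviewmaxdepth");
        if (r == null) return -1;      // key never written: previews disabled
        try {
            final int i = Integer.parseInt(r);
            return i < 0 ? -1 : i;     // clamp negative values to "disabled"
        } catch (final NumberFormatException e) {
            return -1;                 // corrupted entry: fail safe
        }
    }

    public static void main(final String[] args) {
        final Map<String, String> p = new HashMap<>();
        System.out.println(loadPreviewMaxdepth(p)); // -1 (unset)
        p.put("loadpreviewmaxdepth", "2");
        System.out.println(loadPreviewMaxdepth(p)); // 2
        p.put("loadpreviewmaxdepth", "junk");
        System.out.println(loadPreviewMaxdepth(p)); // -1 (invalid)
    }
}
```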
diff --git a/source/net/yacy/crawler/data/Snapshots.java b/source/net/yacy/crawler/data/Snapshots.java
new file mode 100644
index 000000000..404176af1
--- /dev/null
+++ b/source/net/yacy/crawler/data/Snapshots.java
@@ -0,0 +1,160 @@
+/**
+ *  Snapshots
+ *  Copyright 2014 by Michael Peter Christen
+ *  First released 29.11.2014 at http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.crawler.data;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+
+import org.apache.solr.common.SolrDocument;
+
+import net.yacy.cora.date.GenericFormatter;
+import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.search.index.Fulltext;
+import net.yacy.search.schema.CollectionSchema;
+
+/**
+ * This class hosts document snapshots.
+ *
+ * The storage is organized in the following hierarchy:
+ * - in the root path are subpaths for each host:port
+ * - in the host:port path are subpaths for the crawl depth, two digits in length
+ * - in the crawl depth path are subpaths for the first two characters of the url-hash, called shard
+ * - in the shard path are files, named with <urlhash>'.'<date>.<ext>
+ *   .. where the <date> has the form "yyyyMMdd" and ext may be one of {pdf,jpg,png,xml,json}.
+ * The pdf is created with wkhtmltopdf, jpg/png is created with convert
+ * and the xml/json is an extract from solr.
+ *
+ * The construction of the file name with the date makes it possible to keep several copies of the same document
+ * for different snapshot times. The usage of the crawl depth makes it easier to extract a specific part
+ * of the domain.
+ */
+public class Snapshots {
+
+    private File storageLocation;
+
+    public Snapshots(File location) {
+        this.storageLocation = location;
+    }
+
+    /**
+     * Load a pdf snapshot of a document.
+     * A proxy must be given to ensure that multiple loads of embedded resources, e.g. images, are cached.
+     * Use http://localhost:<thisport> as proxy.
+     * @param url
+     * @param depth
+     * @param date
+     * @param proxy - a string of the form 'http://<host>:<port>'
+     * @return
+     */
+    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, String proxy) {
+        File path = definePath(url, "pdf", depth, date);
+        path.getParentFile().mkdirs();
+
+        // STUB
+
+        return path;
+    }
+
+    /**
+     * Compute the path of a snapshot. This does not create the snapshot, only gives a path.
+     * Also, the path to the storage location is not created.
+     * @param url
+     * @param ext
+     * @param depth
+     * @param date
+     * @return a file to the snapshot
+     */
+    public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
+        String id = ASCII.String(url.hash());
+        String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
+        File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
+        return path;
+    }
+
+    /**
+     * get the depth of a document, helper method for definePath to determine the depth value
+     * @param url
+     * @param fulltext
+     * @return the crawldepth of the document
+     */
+    public int getDepth(final DigestURL url, final Fulltext fulltext) {
+        Integer depth = null;
+        if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
+            try {
+                SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
+                if (doc != null) {
+                    depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
+                }
+            } catch (IOException e) {
+                // a failed Solr lookup is not fatal here; fall back to depth 0 below
+            }
+        }
+        return depth == null ? 0 : depth;
+    }
+
+    /**
+     * for a given url, get all paths for storage locations.
+     * The locations are all for the single url but may represent different storage times.
+     * This method is inefficient because it tests all different depths; it is better to use
+     * findPaths/3 with a given depth.
+     * @param url
+     * @param ext
+     * @return a set of files for snapshots of the url
+     */
+    public Collection<File> findPaths(final DigestURL url, final String ext) {
+        for (int i = 0; i < 100; i++) {
+            Collection<File> paths = findPaths(url, ext, i);
+            if (paths.size() > 0) return paths;
+        }
+        return new ArrayList<>(0);
+    }
+
+    /**
+     * for a given url, get all paths for storage locations.
+     * The locations are all for the single url but may represent different storage times.
+     * @param url
+     * @param ext
+     * @param depth
+     * @return a set of files for snapshots of the url
+     */
+    public Collection<File> findPaths(final DigestURL url, final String ext, final int depth) {
+        String id = ASCII.String(url.hash());
+        File pathToShard = pathToShard(url, depth);
+        String[] list = pathToShard.list(); // null if the shard directory does not exist (yet)
+        ArrayList<File> paths = new ArrayList<>();
+        if (list != null) for (String f: list) {
+            if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
+        }
+        return paths;
+    }
+
+    private File pathToShard(final DigestURL url, final int depth) {
+        String id = ASCII.String(url.hash());
+        File pathToHostDir = new File(storageLocation, url.getHost() + ":" + url.getPort());
+        File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
+        File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
+        return pathToShard;
+    }
+
+}
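To make the directory layout described in the class comment concrete, here is a stand-alone sketch that reproduces the path arithmetic of definePath() and pathToShard(). Host, hash, and date are invented for illustration; the real code derives them from DigestURL and GenericFormatter:

```java
import java.io.File;

public class SnapshotPathDemo {
    public static void main(final String[] args) {
        final File root = new File("DATA/HTDOCS/SNAPSHOTS"); // assumed storage root, cf. the Switchboard wiring below
        final String hostPort = "example.com:80";            // <host>:<port>
        final int depth = 2;                                 // crawl depth of the document
        final String urlHash = "AbCdEfGhIjKl";               // illustrative 12-character url-hash
        final String date = "20141129";                      // yyyyMMdd

        final File hostDir  = new File(root, hostPort);
        final File depthDir = new File(hostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
        final File shardDir = new File(depthDir, urlHash.substring(0, 2)); // shard: first two hash characters
        final File snapshot = new File(shardDir, urlHash + "." + date + ".pdf");
        System.out.println(snapshot.getPath());
        // DATA/HTDOCS/SNAPSHOTS/example.com:80/02/Ab/AbCdEfGhIjKl.20141129.pdf
    }
}
```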
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index a1a898694..748b314f0 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -25,6 +25,7 @@ package net.yacy.crawler.retrieval;

 import java.io.IOException;
+import java.util.Date;

 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.solr.FailCategory;
@@ -68,10 +69,16 @@ public final class HTTPLoader {
     }

     public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
+        // load fulltext of html page
         Latency.updateBeforeLoad(entry.url());
         final long start = System.currentTimeMillis();
         final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, agent);
         Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
+
+        // load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
+        if (entry.depth() <= profile.loadPreviewMaxdepth() && "html|shtml|php".indexOf(entry.url().getFile().substring(entry.url().getFile().lastIndexOf('.') + 1)) >= 0) {
+            sb.snapshots.downloadPDFSnapshot(entry.url(), entry.depth(), new Date(), "http://127.0.0.1:" + sb.getConfigInt("port", 8090));
+        }
         return doc;
     }
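The new gate in load() fires only for documents that are both shallow enough and likely to be html pages; note the extension, not the whole path, is tested against the pipe-delimited set. The predicate in isolation, with invented helper names and values (this is not the YaCy API):

```java
public class PreviewGateDemo {

    // previews are attempted only when the feature is enabled, the document is
    // within the configured depth, and the file extension names a page type
    static boolean wantsPreview(final int depth, final int maxDepth, final String file) {
        if (depth > maxDepth) return false; // also covers maxDepth == -1, i.e. feature off
        final String ext = file.substring(file.lastIndexOf('.') + 1).toLowerCase();
        return "html|shtml|php".indexOf(ext) >= 0;
    }

    public static void main(final String[] args) {
        System.out.println(wantsPreview(0,  1, "/index.html")); // true
        System.out.println(wantsPreview(2,  1, "/index.html")); // false: too deep
        System.out.println(wantsPreview(0, -1, "/page.php"));   // false: previews disabled
        System.out.println(wantsPreview(0,  1, "/logo.png"));   // false: not an html page
    }
}
```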
diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java
index 7db41a887..6b0899a6d 100644
--- a/source/net/yacy/data/ymark/YMarkCrawlStart.java
+++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java
@@ -186,6 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
                 crawlingQ,
                 true, true, true, false,
                 true, true, false,
+                -1,
                 CacheStrategy.IFFRESH,
                 "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard
diff --git a/source/net/yacy/kelondro/index/BufferedObjectIndex.java b/source/net/yacy/kelondro/index/BufferedObjectIndex.java
index 12773d7c3..a20db7249 100644
--- a/source/net/yacy/kelondro/index/BufferedObjectIndex.java
+++ b/source/net/yacy/kelondro/index/BufferedObjectIndex.java
@@ -58,6 +58,10 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
         this.entryComparator = new Row.EntryComparator(backend.row().objectOrder);
     }

+    public boolean isOnDemand() {
+        return this.backend instanceof OnDemandOpenFileIndex;
+    }
+
     @Override
     public byte[] smallestKey() {
         if (this.buffer == null || this.buffer.isEmpty()) return this.backend.smallestKey();
diff --git a/source/net/yacy/kelondro/util/OS.java b/source/net/yacy/kelondro/util/OS.java
index 1a5b0e3e1..452a4b815 100644
--- a/source/net/yacy/kelondro/util/OS.java
+++ b/source/net/yacy/kelondro/util/OS.java
@@ -31,7 +31,7 @@
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
-import java.util.Vector;

 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.util.ConcurrentLog;
@@ -197,13 +196,13 @@ public final class OS {
         FileUtils.deletedelete(starterFile);
     }

-    public static Vector<String> execSynchronous(final String command) throws IOException {
-        // runs a unix/linux command and returns output as Vector of Strings
+    public static List<String> execSynchronous(final String command) throws IOException {
+        // runs a unix/linux command and returns the output as a list of strings
         // this method blocks until the command is executed
        final Process p = Runtime.getRuntime().exec(command);
        final BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream()));
        String text;
-       final Vector<String> output = new Vector<String>();
+       final List<String> output = new ArrayList<String>();
        while ((text = in.readLine()) != null) {
            output.add(text);
        }
@@ -212,9 +211,16 @@ public final class OS {
     }

     public static void main(final String[] args) {
+        try {
+            List<String> v = execSynchronous("/usr/local/bin/wkhtmltoimage");
+            for (String r: v) java.lang.System.out.println(r);
+        } catch (IOException e) { // a missing wkhtmltoimage binary is acceptable in this manual test
+        }
+        /*
         if (args[0].equals("-m")) {
             java.lang.System.out.println("Maximum possible memory: " + Integer.toString(getWin32MaxHeap()) + "m");
         }
+        */
     }
 }
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 7cede36e4..571b8221f 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -122,6 +122,7 @@ import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.CrawlQueues;
+import net.yacy.crawler.data.Snapshots;
 import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.data.ResultImages;
 import net.yacy.crawler.data.ResultURLs;
@@ -243,6 +244,7 @@ public final class Switchboard extends serverSwitch {
     public File queuesRoot;
     public File surrogatesInPath;
     //public File surrogatesOutPath;
+    public Snapshots snapshots;
     public Segment index;
     public LoaderDispatcher loader;
     public CrawlSwitchboard crawler;
@@ -344,6 +346,7 @@ public final class Switchboard extends serverSwitch {
         this.htDocsPath = getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT);
         this.log.config("HTDOCS Path: " + this.htDocsPath.toString());
+        this.snapshots = new Snapshots(new File(this.htDocsPath, "SNAPSHOTS"));
         this.workPath = getDataPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT);
         this.workPath.mkdirs();
         // if default work files exist, copy them (don't overwrite existing!)
@@ -3853,27 +3856,6 @@ public final class Switchboard extends serverSwitch {
             i++;
         }
     }
-
-    /*
-    public File getPDF(DigestURL url) {
-        String depth = "00";
-        String idstub = ASCII.String(url.hash()).substring(0, 6);
-        if (this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
-            try {
-                SolrDocument doc = this.index.fulltext().getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
-                if (doc != null) {
-                    depth = (String) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
-                    if (depth == null) depth = "00"; else if (depth.length() < 2) depth = "0" + depth;
-                }
-            } catch (IOException e) {
-            }
-        }
-        File pathToPdf = new File(this.htCachePath, url.getHost() + ":" + url.getPort());
-
-        File pdfFile = new File(pathToPdf, depth + "-" + idstub);
-
-    }
-    */

     public void checkInterruption() throws InterruptedException {
         final Thread curThread = Thread.currentThread();
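downloadPDFSnapshot() is still a stub in this commit, but the surrounding pieces (the execSynchronous() cleanup and the wkhtmltoimage probe in OS.main) suggest the plan: shell out to wkhtmltopdf, routed through the local YaCy instance as a caching proxy. A hypothetical sketch of such a call, not the committed implementation; the binary path, flags, and file names are assumptions:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

public class PdfSnapshotSketch {

    // same shape as the reworked OS.execSynchronous(): run a command, collect its stdout
    static List<String> execSynchronous(final String command) throws IOException {
        final Process p = Runtime.getRuntime().exec(command);
        final List<String> output = new ArrayList<>();
        try (final BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) output.add(line);
        }
        return output;
    }

    public static void main(final String[] args) throws IOException {
        final String proxy = "http://127.0.0.1:8090";           // local YaCy port, as used in HTTPLoader above
        final String url = "http://example.com/";
        final String target = "/tmp/AbCdEfGhIjKl.20141129.pdf"; // name scheme from Snapshots.definePath()
        // wkhtmltopdf accepts a --proxy option; the binary location is an assumption
        for (final String line : execSynchronous("/usr/local/bin/wkhtmltopdf --proxy " + proxy + " " + url + " " + target)) {
            System.out.println(line);
        }
    }
}
```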