added new web page snapshot infrastructure which will lead to the ability to have web page previews in the search results.
(This is a stub; no functionality is available with this yet.)
pull/1/head
Michael Peter Christen 10 years ago
parent aa0faeabc5
commit ad0da5f246

@@ -462,6 +462,7 @@ public class Crawler_p {
 indexMedia,
 storeHTCache,
 crawlOrder,
+-1, // temporary; stub commit
 cachePolicy,
 collection,
 agentName);

@@ -152,6 +152,7 @@ public class QuickCrawlLink_p {
 obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
 indexText, indexMedia,
 storeHTCache, remoteIndexing,
+-1,
 CacheStrategy.IFFRESH,
 collection,
 ClientIdentification.yacyIntranetCrawlerAgentName);

@@ -293,6 +293,7 @@ public final class CrawlSwitchboard {
 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
 true,
 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
+-1,
 CacheStrategy.IFFRESH,
 "robot_" + CRAWL_PROFILE_PROXY,
 ClientIdentification.yacyProxyAgentName);
@@ -322,6 +323,7 @@ public final class CrawlSwitchboard {
 true,
 false,
 false,
+-1,
 CacheStrategy.IFFRESH,
 "robot_" + CRAWL_PROFILE_REMOTE,
 ClientIdentification.yacyInternetCrawlerAgentName);
@@ -351,6 +353,7 @@ public final class CrawlSwitchboard {
 false,
 true,
 false,
+-1,
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -380,6 +383,7 @@ public final class CrawlSwitchboard {
 true,
 true,
 false,
+-1,
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -410,6 +414,7 @@ public final class CrawlSwitchboard {
 false,
 true,
 false,
+-1,
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
 ClientIdentification.browserAgentName);
@@ -439,6 +444,7 @@ public final class CrawlSwitchboard {
 false,
 true,
 false,
+-1,
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -468,6 +474,7 @@ public final class CrawlSwitchboard {
 true,
 true,
 false,
+-1,
 CacheStrategy.IFEXIST,
 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -497,6 +504,7 @@ public final class CrawlSwitchboard {
 false,
 false,
 false,
+-1,
 CacheStrategy.NOCACHE,
 "robot_" + CRAWL_PROFILE_SURROGATE,
 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -529,6 +537,7 @@ public final class CrawlSwitchboard {
 true,
 false,
 false,
+-1,
 CacheStrategy.NOCACHE,
 collection,
 ClientIdentification.yacyIntranetCrawlerAgentName);

@@ -86,6 +86,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
 public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
 public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
+public static final String LOADPREVIEWMAXDEPTH = "loadpreviewmaxdepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
 private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
 private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
@@ -141,6 +142,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 final boolean indexMedia,
 final boolean storeHTCache,
 final boolean remoteIndexing,
+final int loadPreviewMaxdepth,
 final CacheStrategy cacheStrategy,
 final String collections,
 final String userAgentName) {
@@ -176,6 +178,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 put(INDEX_MEDIA, indexMedia);
 put(STORE_HTCACHE, storeHTCache);
 put(REMOTE_INDEXING, remoteIndexing);
+put(LOADPREVIEWMAXDEPTH, loadPreviewMaxdepth);
 put(CACHE_STRAGEGY, cacheStrategy.toString());
 put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
 }
@@ -565,11 +568,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 if (r == null) return false;
 return (r.equals(Boolean.TRUE.toString()));
 }
 public boolean remoteIndexing() {
 final String r = get(REMOTE_INDEXING);
 if (r == null) return false;
 return (r.equals(Boolean.TRUE.toString()));
 }
+public int loadPreviewMaxdepth() {
+final String r = get(LOADPREVIEWMAXDEPTH);
+if (r == null) return -1;
+try {
+final int i = Integer.parseInt(r);
+if (i < 0) return -1;
+return i;
+} catch (final NumberFormatException e) {
+ConcurrentLog.logException(e);
+return -1;
+}
+}
 /**
 * get a recrawl date for a given age in minutes

@@ -0,0 +1,160 @@
/**
* Snapshots
* Copyright 2014 by Michael Peter Christen
* First released 29.11.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.data;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;
/**
* This class hosts document snapshots.
*
* The storage is organized in the following hierarchy:
* - in the root path are subpaths for each host:port
* - in the host:port path are subpaths for the crawl depth, two digits length
* - in the crawl depth path are subpaths for the first two characters of the url-hash, called shard
* - in the shard path are files, named with <urlhash>'.'<date>.<ext>,
*   where the <date> has the form "yyyyMMdd" and <ext> may be one of {pdf,jpg,png,xml,json}.
* The pdf is created with wkhtmltopdf, the jpg/png is created with convert
* and the xml/json is an extract from solr.
*
* The construction of the file name with the date makes it possible to keep several copies of the same
* document for different snapshot times. The use of the crawl depth makes it easier to extract a specific
* part of the domain.
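*
* Example (hypothetical values, for illustration only): a pdf snapshot of a page on http://yacy.net/,
* crawled at depth 2 and taken on 2014-11-29, would be stored as
* <storageLocation>/yacy.net:80/02/Qc/QcLB0VcMD-Lv.20141129.pdf
* where "Qc" is the shard, i.e. the first two characters of the (made-up) url-hash "QcLB0VcMD-Lv".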
*/
public class Snapshots {
private File storageLocation;
public Snapshots(File location) {
this.storageLocation = location;
}
/**
* Load a pdf snapshot of a document.
* A proxy must be given to ensure that resources which are loaded several times (e.g. images) are cached.
* Use http://localhost:<thisport> as proxy.
* @param url
* @param depth
* @param date
* @param proxy - a string of the form 'http://<host>:<port>'
* @return the file where the pdf snapshot is (or, once implemented, will be) stored
*/
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, String proxy) {
File path = definePath(url, "pdf", depth, date);
path.getParentFile().mkdirs();
// STUB
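// A possible later implementation (not part of this commit) could render the page through the local
// proxy with the external wkhtmltopdf tool, e.g. roughly:
//   OS.execSynchronous("wkhtmltopdf --proxy " + proxy + " " + url.toNormalform(true) + " " + path.getAbsolutePath());
// The exact command line and error handling are assumptions; currently only the target path is prepared.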
return path;
}
/**
* Compute the path of a snapshot. This does not create the snapshot, only gives a path.
* Also, the path to the storage location is not created.
* @param url
* @param ext
* @param depth
* @param date
* @return a file to the snapshot
*/
public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
String id = ASCII.String(url.hash());
String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
return path;
}
/**
* get the crawl depth of a document; a helper for determining the depth value that is passed to definePath
* @param url
* @param fulltext
* @return the crawldepth of the document
*/
public int getDepth(final DigestURL url, final Fulltext fulltext) {
Integer depth = null;
if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
try {
SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
if (doc != null) {
depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
}
} catch (IOException e) {
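// a failed Solr lookup is not fatal here; depth stays null and the method falls back to 0 below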
}
}
return depth == null ? 0 : depth;
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* This method is inefficient because it tests all possible depths; if the depth is known,
* it is better to use findPaths/3 with a given depth.
* @param url
* @param ext
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext) {
for (int i = 0; i < 100; i++) {
Collection<File> paths = findPaths(url, ext, i);
if (paths.size() > 0) return paths;
}
return new ArrayList<>(0);
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* @param url
* @param ext
* @param depth
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext, final int depth) {
String id = ASCII.String(url.hash());
File pathToShard = pathToShard(url, depth);
String[] list = pathToShard.list(); // may be null if the shard directory does not exist yet
ArrayList<File> paths = new ArrayList<>();
if (list != null) for (String f: list) {
if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
}
return paths;
}
private File pathToShard(final DigestURL url, final int depth) {
String id = ASCII.String(url.hash());
File pathToHostDir = new File(storageLocation, url.getHost() + ":" + url.getPort());
File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
return pathToShard;
}
}
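
For orientation, a minimal usage sketch of this API (not part of the commit). The data path, port and example URL are illustrative assumptions; downloadPDFSnapshot is still a stub and only computes the target path:

import java.io.File;
import java.util.Collection;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.crawler.data.Snapshots;

public class SnapshotsUsageSketch {
    public static void main(String[] args) throws Exception {
        // the Switchboard creates the instance below HTDOCS, see the Switchboard hunk further down
        Snapshots snapshots = new Snapshots(new File("DATA/HTDOCS", "SNAPSHOTS"));
        DigestURL url = new DigestURL("http://yacy.net/");
        // compute the snapshot location; the pdf itself is not yet generated by this stub
        File pdf = snapshots.downloadPDFSnapshot(url, 0, new Date(), "http://127.0.0.1:8090");
        // later, all stored pdf snapshots for the url can be listed again
        Collection<File> stored = snapshots.findPaths(url, "pdf");
        System.out.println(pdf + " (" + stored.size() + " stored snapshot(s))");
    }
}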

@@ -25,6 +25,7 @@
 package net.yacy.crawler.retrieval;
 import java.io.IOException;
+import java.util.Date;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.solr.FailCategory;
@@ -68,10 +69,16 @@ public final class HTTPLoader {
 }
 public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
+// load fulltext of html page
 Latency.updateBeforeLoad(entry.url());
 final long start = System.currentTimeMillis();
 final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, agent);
 Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
+// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
+if (entry.depth() <= profile.loadPreviewMaxdepth() && "html|shtml|php".indexOf(entry.url().getFile()) >= 0) {
+sb.snapshots.downloadPDFSnapshot(entry.url(), entry.depth(), new Date(), "http://127.0.0.1:" + sb.getConfigInt("port", 8090));
+}
 return doc;
 }

@@ -186,6 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
 crawlingQ,
 true, true, true, false,
 true, true, false,
+-1,
 CacheStrategy.IFFRESH,
 "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
 ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard

@@ -58,6 +58,10 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
 this.entryComparator = new Row.EntryComparator(backend.row().objectOrder);
 }
+public boolean isOnDemand() {
+return this.backend instanceof OnDemandOpenFileIndex;
+}
 @Override
 public byte[] smallestKey() {
 if (this.buffer == null || this.buffer.isEmpty()) return this.backend.smallestKey();

@@ -31,7 +31,6 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
-import java.util.Vector;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.util.ConcurrentLog;
@@ -197,13 +196,13 @@ public final class OS {
 FileUtils.deletedelete(starterFile);
 }
-public static Vector<String> execSynchronous(final String command) throws IOException {
+public static List<String> execSynchronous(final String command) throws IOException {
 // runs a unix/linux command and returns output as Vector of Strings
 // this method blocks until the command is executed
 final Process p = Runtime.getRuntime().exec(command);
 final BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream()));
 String text;
-final Vector<String> output = new Vector<String>();
+final List<String> output = new ArrayList<String>();
 while ((text = in.readLine()) != null) {
 output.add(text);
 }
@@ -212,9 +211,16 @@ public final class OS {
 }
 public static void main(final String[] args) {
+try {
+List<String> v = execSynchronous("/usr/local/bin/wkhtmltoimage");
+for (String r: v) java.lang.System.out.println(r);
+} catch (IOException e) {
+}
+/*
 if (args[0].equals("-m")) {
 java.lang.System.out.println("Maximum possible memory: " + Integer.toString(getWin32MaxHeap()) + "m");
 }
+*/
 }
 }

@@ -122,6 +122,7 @@ import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.CrawlQueues;
+import net.yacy.crawler.data.Snapshots;
 import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.data.ResultImages;
 import net.yacy.crawler.data.ResultURLs;
@@ -243,6 +244,7 @@ public final class Switchboard extends serverSwitch {
 public File queuesRoot;
 public File surrogatesInPath;
 //public File surrogatesOutPath;
+public Snapshots snapshots;
 public Segment index;
 public LoaderDispatcher loader;
 public CrawlSwitchboard crawler;
@@ -344,6 +346,7 @@ public final class Switchboard extends serverSwitch {
 this.htDocsPath =
 getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT);
 this.log.config("HTDOCS Path: " + this.htDocsPath.toString());
+this.snapshots = new Snapshots(new File(this.htDocsPath, "SNAPSHOTS"));
 this.workPath = getDataPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT);
 this.workPath.mkdirs();
 // if default work files exist, copy them (don't overwrite existing!)
@@ -3853,27 +3856,6 @@ public final class Switchboard extends serverSwitch {
 i++;
 }
 }
-/*
-public File getPDF(DigestURL url) {
-String depth = "00";
-String idstub = ASCII.String(url.hash()).substring(0, 6);
-if (this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
-try {
-SolrDocument doc = this.index.fulltext().getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
-if (doc != null) {
-depth = (String) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
-if (depth == null) depth = "00"; else if (depth.length() < 2) depth = "0" + depth;
-}
-} catch (IOException e) {
-}
-}
-File pathToPdf = new File(this.htCachePath, url.getHost() + ":" + url.getPort());
-File pdfFile = new File(pathToPdf, depth + "-" + idstub);
-}
-*/
 public void checkInterruption() throws InterruptedException {
 final Thread curThread = Thread.currentThread();
