added new web page snapshot infrastructure which will lead to the ability to have web page previews in the search results.
(This is a stub; no functionality is available with this yet.)
pull/1/head
Michael Peter Christen 10 years ago
parent aa0faeabc5
commit ad0da5f246

@@ -462,6 +462,7 @@ public class Crawler_p {
indexMedia,
storeHTCache,
crawlOrder,
-1, // loadPreviewMaxdepth; temporary stub, previews disabled
cachePolicy,
collection,
agentName);

@@ -152,6 +152,7 @@ public class QuickCrawlLink_p {
obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
indexText, indexMedia,
storeHTCache, remoteIndexing,
-1,
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);

@@ -293,6 +293,7 @@ public final class CrawlSwitchboard {
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-1,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName);
@@ -322,6 +323,7 @@ public final class CrawlSwitchboard {
true,
false,
false,
-1,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName);
@@ -351,6 +353,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -380,6 +383,7 @@ public final class CrawlSwitchboard {
true,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -410,6 +414,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName);
@@ -439,6 +444,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -468,6 +474,7 @@ public final class CrawlSwitchboard {
true,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -497,6 +504,7 @@ public final class CrawlSwitchboard {
false,
false,
false,
-1,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -529,6 +537,7 @@ public final class CrawlSwitchboard {
true,
false,
false,
-1,
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);

@@ -86,6 +86,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
public static final String LOADPREVIEWMAXDEPTH = "loadpreviewmaxdepth"; // if previews shall be loaded, this is a non-negative maximum crawl depth; otherwise it is -1
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
@@ -141,6 +142,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean indexMedia,
final boolean storeHTCache,
final boolean remoteIndexing,
final int loadPreviewMaxdepth,
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName) {
@@ -176,6 +178,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(REMOTE_INDEXING, remoteIndexing);
put(LOADPREVIEWMAXDEPTH, loadPreviewMaxdepth);
put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
}
@@ -565,11 +568,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(REMOTE_INDEXING);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public int loadPreviewMaxdepth() {
final String r = get(LOADPREVIEWMAXDEPTH);
if (r == null) return -1;
try {
final int i = Integer.parseInt(r);
if (i < 0) return -1;
return i;
} catch (final NumberFormatException e) {
ConcurrentLog.logException(e);
return -1;
}
}
/**
* get a recrawl date for a given age in minutes

@@ -0,0 +1,160 @@
/**
* DocumentImage
* Copyright 2014 by Michael Peter Christen
* First released 29.11.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.data;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;
/**
* This class hosts document snapshots.
*
* The storage is organized in the following hierarchy:
* - in the root path are subpaths for each host:port
* - in the host:port path are subpaths for the crawl depth, two digits length
* - in the crawl depth path are subpaths for the first two characters of the url-hash, called shard
* - in the shard path are files, named with <urlhash>'.'<date>.<ext>,
* where <date> has the form "yyyyMMdd" and <ext> may be one of {pdf,jpg,png,xml,json}.
* The pdf is created with wkhtmltopdf, the jpg/png is created with convert,
* and the xml/json is an extract from solr.
*
* Including the date in the file name makes it possible to keep several snapshots of the same document,
* taken at different times. Organizing by crawl depth makes it easier to extract a specific part
* of the domain.
*/
public class Snapshots {
private File storageLocation;
public Snapshots(File location) {
this.storageLocation = location;
}
/**
* Load a pdf snapshot of a document.
* A proxy must be given to ensure that embedded resources (e.g. images) which are loaded several times are cached.
* Use http://localhost:<thisport> as proxy.
* @param url
* @param depth
* @param date
* @param proxy - a string of the form 'http://<host>:<port>'
* @return the file where the pdf snapshot is (or will be) stored
*/
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, String proxy) {
File path = definePath(url, "pdf", depth, date);
path.getParentFile().mkdirs();
// STUB
return path;
}
/**
* Compute the path of a snapshot. This does not create the snapshot, only gives a path.
* Also, the path to the storage location is not created.
* @param url
* @param ext
* @param depth
* @param date
* @return a file to the snapshot
*/
public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
String id = ASCII.String(url.hash());
String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
return path;
}
/**
* get the depth of a document; helper to determine the depth argument for definePath
* @param url
* @param fulltext
* @return the crawldepth of the document
*/
public int getDepth(final DigestURL url, final Fulltext fulltext) {
Integer depth = null;
if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
try {
SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
if (doc != null) {
depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
}
} catch (IOException e) {
// solr is not reachable: fall back to the default depth 0
}
}
return depth == null ? 0 : depth;
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* This method is inefficient because it probes all possible depths; if the depth is known,
* use findPaths(url, ext, depth) instead.
* @param url
* @param ext
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext) {
for (int i = 0; i < 100; i++) {
Collection<File> paths = findPaths(url, ext, i);
if (paths.size() > 0) return paths;
}
return new ArrayList<>(0);
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* @param url
* @param ext
* @param depth
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext, final int depth) {
String id = ASCII.String(url.hash());
File pathToShard = pathToShard(url, depth);
String[] list = pathToShard.list(); // may be null if the shard directory does not exist yet
ArrayList<File> paths = new ArrayList<>();
if (list != null) for (String f: list) {
if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
}
return paths;
}
private File pathToShard(final DigestURL url, final int depth) {
String id = ASCII.String(url.hash());
File pathToHostDir = new File(storageLocation, url.getHost() + ":" + url.getPort());
File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
return pathToShard;
}
}
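
A minimal usage sketch of the path scheme described in the Snapshots class comment above (not part of the commit; the demo class name, example URL and depth are made up, and the SNAPSHOTS root matches the Switchboard wiring further down in this diff):

import java.io.File;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.crawler.data.Snapshots;

public class SnapshotPathDemo {
    public static void main(String[] args) throws Exception {
        // storage root: the "SNAPSHOTS" directory under the HTDOCS path, as wired in Switchboard
        Snapshots snapshots = new Snapshots(new File("DATA/HTDOCS/SNAPSHOTS"));
        DigestURL url = new DigestURL("http://example.org/page.html"); // example URL
        // depth 2, today's date -> DATA/HTDOCS/SNAPSHOTS/example.org:80/02/<xy>/<urlhash>.<yyyyMMdd>.pdf
        // where <xy> are the first two characters of the url hash
        File pdf = snapshots.definePath(url, "pdf", 2, new Date());
        System.out.println(pdf.getPath());
    }
}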

@@ -25,6 +25,7 @@
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
@@ -68,10 +69,16 @@ public final class HTTPLoader {
}
public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load fulltext of html page
Latency.updateBeforeLoad(entry.url());
final long start = System.currentTimeMillis();
final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, agent);
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
// load a pdf snapshot in case that is wanted. This can later be used to compute a web page preview in the search results
final String file = entry.url().getFile();
final int dot = file.lastIndexOf('.');
final String ext = dot < 0 ? "" : file.substring(dot + 1).toLowerCase();
if (entry.depth() <= profile.loadPreviewMaxdepth() && ("html".equals(ext) || "shtml".equals(ext) || "php".equals(ext))) {
sb.snapshots.downloadPDFSnapshot(entry.url(), entry.depth(), new Date(), "http://127.0.0.1:" + sb.getConfigInt("port", 8090));
}
return doc;
}
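
The gate above depends on the profile default: loadPreviewMaxdepth() returns -1 when the setting is missing or invalid, and entry.depth() is never <= -1, so no snapshots are taken unless a crawl profile explicitly stores a non-negative depth. A tiny sketch of that semantics (values are hypothetical):

public class PreviewGateDemo {
    public static void main(String[] args) {
        int loadPreviewMaxdepth = -1; // profile default: previews disabled
        for (int depth = 0; depth <= 3; depth++) {
            System.out.println("depth " + depth + " -> snapshot: " + (depth <= loadPreviewMaxdepth));
        }
        // prints false for every depth; with loadPreviewMaxdepth = 2, depths 0..2 would print true
    }
}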

@@ -186,6 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
crawlingQ,
true, true, true, false,
true, true, false,
-1,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard

@@ -58,6 +58,10 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
this.entryComparator = new Row.EntryComparator(backend.row().objectOrder);
}
public boolean isOnDemand() {
return this.backend instanceof OnDemandOpenFileIndex;
}
@Override
public byte[] smallestKey() {
if (this.buffer == null || this.buffer.isEmpty()) return this.backend.smallestKey();

@@ -31,7 +31,6 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Vector;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.util.ConcurrentLog;
@@ -197,13 +196,13 @@ public final class OS {
FileUtils.deletedelete(starterFile);
}
public static Vector<String> execSynchronous(final String command) throws IOException {
public static List<String> execSynchronous(final String command) throws IOException {
// runs a unix/linux command and returns the output as a list of strings
// this method blocks until the command is executed
final Process p = Runtime.getRuntime().exec(command);
final BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream()));
String text;
final Vector<String> output = new Vector<String>();
final List<String> output = new ArrayList<String>();
while ((text = in.readLine()) != null) {
output.add(text);
}
@@ -212,9 +211,16 @@
}
public static void main(final String[] args) {
try {
List<String> v = execSynchronous("/usr/local/bin/wkhtmltoimage");
for (String r: v) java.lang.System.out.println(r);
} catch (IOException e) {
// the test binary is not installed or failed; nothing to report
}
/*
if (args[0].equals("-m")) {
java.lang.System.out.println("Maximum possible memory: " + Integer.toString(getWin32MaxHeap()) + "m");
}
*/
}
}
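
The wkhtmltoimage test added to main() above hints at how the Snapshots.downloadPDFSnapshot() stub could later be filled in. A hedged sketch, assuming wkhtmltopdf is installed at /usr/local/bin/wkhtmltopdf, that it supports a --proxy option, and that OS lives in net.yacy.kelondro.util; the helper class and method are hypothetical and not part of this commit:

import java.io.File;
import java.io.IOException;
import java.util.List;

import net.yacy.kelondro.util.OS;

public class PdfSnapshotSketch {
    // render a url into the target file computed by Snapshots.definePath()
    public static File renderPDF(String url, String proxy, File target) throws IOException {
        target.getParentFile().mkdirs();
        // binary path and --proxy option are assumptions, see above
        String command = "/usr/local/bin/wkhtmltopdf --proxy " + proxy + " " + url + " " + target.getAbsolutePath();
        List<String> output = OS.execSynchronous(command);
        for (String line : output) System.out.println(line); // log the tool output
        return target.exists() ? target : null;
    }
}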

@@ -122,6 +122,7 @@ import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.Snapshots;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultImages;
import net.yacy.crawler.data.ResultURLs;
@@ -243,6 +244,7 @@ public final class Switchboard extends serverSwitch {
public File queuesRoot;
public File surrogatesInPath;
//public File surrogatesOutPath;
public Snapshots snapshots;
public Segment index;
public LoaderDispatcher loader;
public CrawlSwitchboard crawler;
@@ -344,6 +346,7 @@ public final class Switchboard extends serverSwitch {
this.htDocsPath =
getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT);
this.log.config("HTDOCS Path: " + this.htDocsPath.toString());
this.snapshots = new Snapshots(new File(this.htDocsPath, "SNAPSHOTS"));
this.workPath = getDataPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT);
this.workPath.mkdirs();
// if default work files exist, copy them (don't overwrite existing!)
@@ -3853,27 +3856,6 @@ public final class Switchboard extends serverSwitch {
i++;
}
}
/*
public File getPDF(DigestURL url) {
String depth = "00";
String idstub = ASCII.String(url.hash()).substring(0, 6);
if (this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
try {
SolrDocument doc = this.index.fulltext().getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
if (doc != null) {
depth = (String) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
if (depth == null) depth = "00"; else if (depth.length() < 2) depth = "0" + depth;
}
} catch (IOException e) {
}
}
File pathToPdf = new File(this.htCachePath, url.getHost() + ":" + url.getPort());
File pdfFile = new File(pathToPdf, depth + "-" + idstub);
}
*/
public void checkInterruption() throws InterruptedException {
final Thread curThread = Thread.currentThread();
