enhanced the snapshot functionality:

- snapshots can now also be xml files which are extracted from the Solr
index and stored as individual xml files in the snapshot directory
alongside the pdf and jpg images
- a transaction layer was placed on top of the snapshot directory to
distinguish snapshots as 'inventory' and 'archive'. This may be used
to exchange index fragments between peers using archived Solr search
results. This is currently unfinished; a protocol to move snapshots
from inventory to archive is still needed
- the SNAPSHOTS directory was renamed to snapshots and now contains two
subdirectories: inventory and archive
- snapshots may now be generated by every peer, not only by peers
running on a server with wkhtmltopdf installed. The expert crawl start
offers the snapshot option to everyone. PDF snapshots are now optional,
and that option is only shown if wkhtmltopdf is installed.
- the snapshot API now serves historised xml files, i.e. call:
http://localhost:8090/api/snapshot.xml?urlhash=Q3dQopFh1hyQ
The content of such an xml file is identical to a Solr search result
with exactly one hit (a minimal client sketch follows this list).
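
The following minimal client sketch is not part of the commit; it fetches such a historised xml file, assuming a peer running locally on the default port 8090 and reusing the example urlhash from above:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class SnapshotXmlFetch {
    public static void main(String[] args) throws Exception {
        // assumption: local peer on the default port; the urlhash is the example given above
        URL api = new URL("http://localhost:8090/api/snapshot.xml?urlhash=Q3dQopFh1hyQ");
        try (BufferedReader in = new BufferedReader(new InputStreamReader(api.openStream(), StandardCharsets.UTF_8))) {
            String line;
            // the body is a Solr-style <response> containing exactly one <doc>
            while ((line = in.readLine()) != null) System.out.println(line);
        }
    }
}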
PDF generation has been moved from the HTTP loading process to the
Solr document storage process. This may slow down storage considerably,
so a different version of this process may be needed (a condensed
sketch of the new hook follows).
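
For orientation, here is the new storage-time hook condensed into one illustrative method; it is not additional code in the commit. The names are taken from the Segment.java and Transactions.java hunks below, and the depth parameter stands in for document.getDepth():

import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Transactions;
import net.yacy.document.parser.htmlParser;

public class SnapshotHookSketch {
    // illustrative condensation of the hook now placed in Segment.storeDocument (see the Segment.java hunk below)
    public static void maybeSnapshot(SolrInputDocument vector, DigestURL url, int depth, CrawlProfile profile, String proxy) {
        boolean web = url.getProtocol().equals("http") || url.getProtocol().equals("https");
        if (!web || profile == null || depth > profile.snapshotMaxdepth()) return;
        String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
        if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
            // writes the xml metadata into the inventory and, if configured, renders the pdf with wkhtmltopdf (blocking call)
            Transactions.store(vector, profile.snapshotLoadImage(), profile.snapshotReplaceold(), proxy, profile.getAgent());
        }
    }
}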
pull/1/head
Michael Peter Christen 10 years ago
parent 4111d42c81
commit 8df8ffbb6d

@ -460,28 +460,34 @@
</dl>
</fieldset>
#(/agentSelect)#
#(snapshotSelect)#<input type="hidden" name="snapshotsMaxDepth" id="snapshotsMaxDepth" value="-1" />::
<fieldset>
<legend>Snapshot Creation</legend>
<dl>
<dt><label for="snapshot">Max Depth for Snapshots</label></dt>
<dt><label for="snapshotMaxdepth">Max Depth for Snapshots</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
Snapshots are pictures of web pages that can be created during crawling time. These pictures will be stored as pdf at first into subdirectories
of HTCACHE/SNAPSHOTS/ and are computed to jpg from the pdfs later. Snapshot generation can be controlled using a depth parameter; that
Snapshots are xml metadata and pictures of web pages that can be created during crawling time.
The xml data is stored in the same way as a Solr search result with one hit and the pictures will be stored as pdf into subdirectories
of HTCACHE/snapshots/. From the pdfs the jpg thumbnails are computed. Snapshot generation can be controlled using a depth parameter; that
means a snapshot is only be generated if the crawl depth of a document is smaller or equal to the given number here. If the number is set to -1,
no snapshots are generated.
</span></span>
<input type="text" name="snapshotsMaxDepth" id="snapshotsMaxDepth" size="2" maxlength="2" value="#[snapshotsMaxDepth]#" />
</dd>
<dt><label for="snapshot">Multiple Snapshot Versions</label></dt>
<dt><label for="snapshotVersion">Multiple Snapshot Versions</label></dt>
<dd>
<input type="radio" name="snapshotsReplaceOld" value="on" checked="checked"/> replace old snapshots with new one&nbsp;&nbsp;&nbsp;
<input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
</dd>
#(snapshotEnableImages)#
<input type="hidden" name="snapshotsLoadImage" id="snapshotsLoadImage" value="false"/>::
<dt><label for="snapshotImage">Image Creation</label></dt>
<dd>
<input type="checkbox" name="snapshotsLoadImage" id="snapshotsLoadImage" #(snapshotsLoadImageChecked)#::checked="checked"#(/snapshotsLoadImageChecked)# />
</dd>
#(/snapshotEnableImages)#
</dl>
</fieldset>
#(/snapshotSelect)#
<fieldset>
<legend>Index Administration</legend>
<dl>

@ -515,13 +515,14 @@ public class CrawlStartExpert {
// ---------- Snapshot generation
boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
boolean convertAvailable = Html2Image.convertAvailable();
prop.put("snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1"));
if (sb.getConfigBool("isTransparentProxy", false) &&
sb.getConfigBool("proxyAlwaysFresh", false) &&
wkhtmltopdfAvailable && convertAvailable) {
prop.put("snapshotSelect", 1);
prop.put("snapshotSelect_snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1"));
prop.put("snapshotEnableImages", 1);
prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 0 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
} else {
prop.put("snapshotSelect", 0);
prop.put("snapshotEnableImages", 0);
}
// ---------- Index Administration

@ -442,6 +442,7 @@ public class Crawler_p {
String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1");
int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
// prepare a new crawling profile
@ -471,6 +472,7 @@ public class Crawler_p {
storeHTCache,
crawlOrder,
snapshotsMaxDepth,
snapshotsLoadImage,
snapshotsReplaceOld,
cachePolicy,
collection,

@ -152,7 +152,7 @@ public class QuickCrawlLink_p {
obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
indexText, indexMedia,
storeHTCache, remoteIndexing,
-1, true,
-1, false, true,
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);

@ -29,6 +29,9 @@ import java.util.Collection;
import java.util.Date;
import java.util.Map;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSFeed;
@ -39,6 +42,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.Snapshots;
import net.yacy.crawler.data.Transactions;
import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
@ -68,8 +72,10 @@ public class snapshot {
Integer depth = depthx == -1 ? null : depthx;
String orderx = post == null ? "ANY" : post.get("order", "ANY");
Snapshots.Order order = Snapshots.Order.valueOf(orderx);
String statex = post == null ? Transactions.State.INVENTORY.name() : post.get("state", Transactions.State.INVENTORY.name());
Transactions.State state = Transactions.State.valueOf(statex);
String host = post == null ? null : post.get("host");
Map<String, Date> iddate = sb.snapshots.select(host, depth, order, maxcount);
Map<String, Date> iddate = Transactions.select(host, depth, order, maxcount, state);
// now select the URL from the index for these ids in iddate and make an RSS feed
RSSFeed rssfeed = new RSSFeed(Integer.MAX_VALUE);
rssfeed.setChannel(new RSSMessage("Snapshot list for host = " + host + ", depth = " + depth + ", order = " + order + ", maxcount = " + maxcount, "", ""));
@ -89,6 +95,7 @@ public class snapshot {
}
if (post == null) return null;
final boolean xml = ext.equals("xml");
final boolean pdf = ext.equals("pdf");
if (pdf && !authenticated) return null;
final boolean pngjpg = ext.equals("png") || ext.equals("jpg");
@ -112,61 +119,85 @@ public class snapshot {
}
if (durl == null) return null;
url = durl.toNormalform(true);
Collection<File> snapshots = sb.snapshots.findPaths(durl, "pdf");
File pdfFile = null;
if (snapshots.size() == 0) {
// if the client is authenticated, we create the pdf on the fly!
if (!authenticated) return null;
pdfFile = sb.snapshots.downloadPDFSnapshot(durl, 99, new Date(), true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent.userAgent);
} else {
pdfFile = snapshots.iterator().next();
}
if (pdfFile == null) return null;
if (pdf) {
if (xml) {
Collection<File> xmlSnapshots = Transactions.findPaths(durl, "xml", Transactions.State.ANY);
File xmlFile = null;
if (xmlSnapshots.size() == 0) {
return null;
}
xmlFile = xmlSnapshots.iterator().next();
try {
byte[] pdfBinary = FileUtils.read(pdfFile);
return new ByteArrayInputStream(pdfBinary);
byte[] xmlBinary = FileUtils.read(xmlFile);
return new ByteArrayInputStream(xmlBinary);
} catch (IOException e) {
ConcurrentLog.logException(e);
return null;
}
}
if (pngjpg) {
int width = Math.min(post.getInt("width", DEFAULT_WIDTH), DEFAULT_WIDTH);
int height = Math.min(post.getInt("height", DEFAULT_HEIGHT), DEFAULT_HEIGHT);
String imageFileStub = pdfFile.getAbsolutePath(); imageFileStub = imageFileStub.substring(0, imageFileStub.length() - 3); // cut off extension
File imageFile = new File(imageFileStub + DEFAULT_WIDTH + "." + DEFAULT_HEIGHT + "." + DEFAULT_EXT);
if (!imageFile.exists() && authenticated) {
Html2Image.pdf2image(pdfFile, imageFile, DEFAULT_WIDTH, DEFAULT_HEIGHT, DEFAULT_DENSITY, DEFAULT_QUALITY);
if (pdf || pngjpg) {
Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);
File pdfFile = null;
if (pdfSnapshots.size() == 0) {
// if the client is authenticated, we create the pdf on the fly!
if (!authenticated) return null;
SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash());
SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd);
boolean success = Transactions.store(sid, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent);
if (success) {
pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY);
if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next();
}
} else {
pdfFile = pdfSnapshots.iterator().next();
}
if (!imageFile.exists()) return null;
if (width == DEFAULT_WIDTH && height == DEFAULT_HEIGHT) {
if (pdfFile == null) return null;
if (pdf) {
try {
byte[] imageBinary = FileUtils.read(imageFile);
return new ByteArrayInputStream(imageBinary);
byte[] pdfBinary = FileUtils.read(pdfFile);
return new ByteArrayInputStream(pdfBinary);
} catch (IOException e) {
ConcurrentLog.logException(e);
return null;
}
}
// lets read the file and scale
Image image;
try {
image = ImageParser.parse(imageFile.getAbsolutePath(), FileUtils.read(imageFile));
final Image scaled = image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
final MediaTracker mediaTracker = new MediaTracker(new Container());
mediaTracker.addImage(scaled, 0);
try {mediaTracker.waitForID(0);} catch (final InterruptedException e) {}
return scaled;
} catch (IOException e) {
ConcurrentLog.logException(e);
return null;
if (pngjpg) {
int width = Math.min(post.getInt("width", DEFAULT_WIDTH), DEFAULT_WIDTH);
int height = Math.min(post.getInt("height", DEFAULT_HEIGHT), DEFAULT_HEIGHT);
String imageFileStub = pdfFile.getAbsolutePath(); imageFileStub = imageFileStub.substring(0, imageFileStub.length() - 3); // cut off extension
File imageFile = new File(imageFileStub + DEFAULT_WIDTH + "." + DEFAULT_HEIGHT + "." + DEFAULT_EXT);
if (!imageFile.exists() && authenticated) {
Html2Image.pdf2image(pdfFile, imageFile, DEFAULT_WIDTH, DEFAULT_HEIGHT, DEFAULT_DENSITY, DEFAULT_QUALITY);
}
if (!imageFile.exists()) return null;
if (width == DEFAULT_WIDTH && height == DEFAULT_HEIGHT) {
try {
byte[] imageBinary = FileUtils.read(imageFile);
return new ByteArrayInputStream(imageBinary);
} catch (IOException e) {
ConcurrentLog.logException(e);
return null;
}
}
// lets read the file and scale
Image image;
try {
image = ImageParser.parse(imageFile.getAbsolutePath(), FileUtils.read(imageFile));
final Image scaled = image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
final MediaTracker mediaTracker = new MediaTracker(new Container());
mediaTracker.addImage(scaled, 0);
try {mediaTracker.waitForID(0);} catch (final InterruptedException e) {}
return scaled;
} catch (IOException e) {
ConcurrentLog.logException(e);
return null;
}
}
}
return null;
}
}

@ -35,6 +35,8 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.XML;
@ -181,7 +183,7 @@ public class EnhancedXMLResponseWriter implements QueryResponseWriter {
while (fidx2 < sz && fieldName.equals(fields.get(fidx2).name())) {
fidx2++;
}
SchemaField sf = schema.getFieldOrNull(fieldName);
SchemaField sf = schema == null ? null : schema.getFieldOrNull(fieldName);
if (sf == null) {
sf = new SchemaField(fieldName, new TextField());
}
@ -189,6 +191,7 @@ public class EnhancedXMLResponseWriter implements QueryResponseWriter {
if (fidx1 + 1 == fidx2) {
if (sf.multiValued()) {
startTagOpen(writer, "arr", fieldName);
writer.write(lb);
String sv = value.stringValue();
writeField(writer, type.getTypeName(), null, sv); //sf.write(this, null, f1);
writer.write("</arr>");
@ -197,6 +200,7 @@ public class EnhancedXMLResponseWriter implements QueryResponseWriter {
}
} else {
startTagOpen(writer, "arr", fieldName);
writer.write(lb);
for (int i = fidx1; i < fidx2; i++) {
String sv = fields.get(i).stringValue();
writeField(writer, type.getTypeName(), null, sv); //sf.write(this, null, (Fieldable)this.tlst.get(i));
@ -209,8 +213,29 @@ public class EnhancedXMLResponseWriter implements QueryResponseWriter {
writer.write("</doc>");
writer.write(lb);
}
public static final void writeDoc(final Writer writer, final SolrInputDocument sid) throws IOException {
startTagOpen(writer, "doc", null);
for (String key: sid.getFieldNames()) {
SolrInputField sif = sid.getField(key);
Object value = sif.getValue();
if (value == null) {
} else if (value instanceof Collection<?>) {
startTagOpen(writer, "arr", key);
writer.write(lb);
for (Object o: (Collection<?>) value) {
writeField(writer, null, o);
}
writer.write("</arr>"); writer.write(lb);
} else {
writeField(writer, key, value);
}
}
writer.write("</doc>");
writer.write(lb);
}
private static final void writeDoc(final Writer writer, final SolrDocument doc) throws IOException {
public static final void writeDoc(final Writer writer, final SolrDocument doc) throws IOException {
startTagOpen(writer, "doc", null);
final Map<String, Object> fields = doc.getFieldValueMap();
for (String key: fields.keySet()) {
@ -219,6 +244,7 @@ public class EnhancedXMLResponseWriter implements QueryResponseWriter {
if (value == null) {
} else if (value instanceof Collection<?>) {
startTagOpen(writer, "arr", key);
writer.write(lb);
for (Object o: ((Collection<?>) value)) {
writeField(writer, null, o);
}

@ -80,10 +80,16 @@ public class Html2Image {
*/
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, File destination) {
boolean success = writeWkhtmltopdfInternal(url, proxy, destination, null, true);
if (success) return true;
if (proxy == null) return false;
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
return writeWkhtmltopdfInternal(url, null, destination, userAgent, true);
if (!success && proxy != null) {
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
success = writeWkhtmltopdfInternal(url, null, destination, userAgent, true);
}
if (success) {
ConcurrentLog.info("Html2Image", "wrote " + destination.toString() + " for " + url);
} else {
ConcurrentLog.warn("Html2Image", "could not generate snapshot for " + url);
}
return success;
}
private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination, String userAgent, boolean ignoreErrors) {

@ -293,7 +293,7 @@ public final class CrawlSwitchboard {
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-1, true,
-1, false, true,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName);
@ -323,7 +323,7 @@ public final class CrawlSwitchboard {
true,
false,
false,
-1, true,
-1, false, true,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName);
@ -353,7 +353,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1, true,
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -383,7 +383,7 @@ public final class CrawlSwitchboard {
true,
true,
false,
-1, true,
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -414,7 +414,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1, true,
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName);
@ -444,7 +444,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1, true,
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -474,7 +474,7 @@ public final class CrawlSwitchboard {
true,
true,
false,
-1, true,
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -504,7 +504,7 @@ public final class CrawlSwitchboard {
false,
false,
false,
-1, true,
-1, false, true,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -537,7 +537,7 @@ public final class CrawlSwitchboard {
true,
false,
false,
-1, true,
-1, false, true,
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);

@ -88,7 +88,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day
public static final String SNAPSHOTS_LOADIMAGE = "snapshotsLoadImage"; // if true, an image is loaded
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
@ -144,6 +145,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean storeHTCache,
final boolean remoteIndexing,
final int snapshotsMaxDepth,
final boolean snapshotsLoadImage,
final boolean snapshotsReplaceOld,
final CacheStrategy cacheStrategy,
final String collections,
@ -181,6 +183,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(STORE_HTCACHE, storeHTCache);
put(REMOTE_INDEXING, remoteIndexing);
put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage);
put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
@ -590,6 +593,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return -1;
}
}
public boolean snapshotLoadImage() {
final String r = get(SNAPSHOTS_LOADIMAGE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean snapshotReplaceold() {
final String r = get(SNAPSHOTS_REPLACEOLD);

@ -37,7 +37,6 @@ import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.Html2Image;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;
@ -65,6 +64,7 @@ public class Snapshots {
public Snapshots(File location) {
this.storageLocation = location;
this.storageLocation.mkdirs();
// scan the location to fill the directory
this.directory = new HashMap<>();
for (String domain: location.list()) {
@ -98,31 +98,6 @@ public class Snapshots {
}
}
}
/**
* Load a pdf snapshot of a document.
* A proxy must be given to ensure that multiple loads containing i.e. image are cached
* Use http://localhost:<thisport> as proxy.
* @param url
* @param depth
* @param date
* @param proxy - a string of the form 'http://<host>:<port>
* @return
*/
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy, String userAgent) {
Collection<File> oldPaths = findPaths(url, depth, "pdf");
if (replaceOld) {
for (File oldPath: oldPaths) oldPath.delete();
}
File path = definePath(url, depth, date, "pdf");
path.getParentFile().mkdirs();
boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
if (success) {
announceStorage(url, depth, date);
return path;
}
return null;
}
/**
* Compute the path of a snapshot. This does not create the snapshot, only gives a path.
@ -140,7 +115,7 @@ public class Snapshots {
return path;
}
private void announceStorage(final DigestURL url, final int depth, final Date date) {
public void announceStorage(final DigestURL url, final int depth, final Date date) {
String id = ASCII.String(url.hash());
String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(url));

@ -0,0 +1,200 @@
/**
* Transactions
* Copyright 2014 by Michael Peter Christen
* First released 08.12.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.data;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.Snapshots.Order;
import net.yacy.search.schema.CollectionSchema;
/**
* This is a static class holding one or several Snapshot directories
* Transacted snapshots are moved from the inventory snapshot directory to the archive snapshot directory.
*
*/
public class Transactions {
private final static String XML_PREFIX = "<response>\n<!--\n";
private final static char[] WHITESPACE = new char[132];
private final static int WHITESPACE_START = XML_PREFIX.length();
private final static int WHITESPACE_LENGTH = WHITESPACE.length;
private final static String SNAPSHOT_INVENTORY_DIR = "inventory";
private final static String SNAPSHOT_ARCHIVE_DIR = "archive";
private static File transactionDir = null, inventoryDir = null, archiveDir = null;
private static Snapshots inventory = null, archive = null;
static {
for (int i = 0; i < WHITESPACE.length; i++) WHITESPACE[i] = 32;
}
public static enum State {
INVENTORY, ARCHIVE, ANY;
}
public static void init(File dir) {
transactionDir = dir;
transactionDir.mkdirs();
inventoryDir = new File(transactionDir, SNAPSHOT_INVENTORY_DIR);
inventory = new Snapshots(inventoryDir);
archiveDir = new File(transactionDir, SNAPSHOT_ARCHIVE_DIR);
archive = new Snapshots(archiveDir);
}
public static boolean store(SolrInputDocument doc, boolean loadImage, boolean replaceOld, String proxy, ClientIdentification.Agent agent) {
// GET METADATA FROM DOC
String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
Date date = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
int depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
DigestURL url;
try {
url = new DigestURL(urls);
} catch (MalformedURLException e) {
ConcurrentLog.logException(e);
return false;
}
// STORE AN IMAGE
Collection<File> oldPaths = Transactions.findPaths(url, depth, "pdf", Transactions.State.INVENTORY);
if (replaceOld) {
for (File oldPath: oldPaths) oldPath.delete();
}
File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
metadataPath.getParentFile().mkdirs();
boolean success = true;
if (loadImage) {
File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY);
success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath);
}
// STORE METADATA FOR THE IMAGE
if (success) try {
if (doc != null) {
FileOutputStream fos = new FileOutputStream(metadataPath);
OutputStreamWriter osw = new OutputStreamWriter(fos);
osw.write(XML_PREFIX);
osw.write(WHITESPACE); osw.write("\n-->\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata)
osw.write("<result name=\"response\" numFound=\"1\" start=\"0\">\n");
EnhancedXMLResponseWriter.writeDoc(osw, doc);
osw.write("</result>\n");
osw.write("</response>\n");
osw.close();
fos.close();
Transactions.announceStorage(url, depth, date);
}
return true;
} catch (IOException e) {
ConcurrentLog.logException(e);
return false;
}
return false;
}
/**
* select a set of urlhashes from the snapshot directory. The selection either ordered
* by generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or with any
* order. The result set can be selected either with a given host or a depth
* @param host selected host or null for all hosts
* @param depth selected depth or null for all depths
* @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
* @param maxcount the maximum number of hosthashes. If unlimited, submit Integer.MAX_VALUE
* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
* @return a map of hosthashes with the associated creation date
*/
public static Map<String, Date> select(String host, Integer depth, final Order order, int maxcount, State state) {
Map<String, Date> result = new HashMap<>();
if (state == State.INVENTORY || state == State.ANY) result.putAll(inventory.select(host, depth, order, maxcount));
if (state == State.ARCHIVE || state == State.ANY) result.putAll(archive.select(host, depth, order, maxcount));
return result;
}
/**
* Compute the path of a snapshot. This does not create the snapshot, only gives a path.
* Also, the path to the storage location is not created.
* @param url
* @param depth
* @param date
* @param ext
* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
* @return a file to the snapshot
*/
public static File definePath(final DigestURL url, final int depth, final Date date, final String ext, State state) {
if (state == State.ANY) throw new RuntimeException("definePath must be selected with INVENTORY or ARCHIVE state");
if (state == State.INVENTORY) return inventory.definePath(url, depth, date, ext);
if (state == State.ARCHIVE) return archive.definePath(url, depth, date, ext);
return null;
}
public static void announceStorage(final DigestURL url, final int depth, final Date date) {
inventory.announceStorage(url, depth, date);
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* This method is inefficient because it tests all different depths, it would be better to use
* findPaths/3 with a given depth.
* @param url
* @param ext
* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
* @return a set of files for snapshots of the url
*/
public static Collection<File> findPaths(final DigestURL url, final String ext, State state) {
Collection<File> result = new ArrayList<>();
if (state == State.INVENTORY || state == State.ANY) result.addAll(inventory.findPaths(url, ext));
if (state == State.ARCHIVE || state == State.ANY) result.addAll(archive.findPaths(url, ext));
return result;
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* @param url
* @param ext
* @param depth
* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
* @return a set of files for snapshots of the url
*/
public static Collection<File> findPaths(final DigestURL url, final int depth, final String ext, State state) {
Collection<File> result = new ArrayList<>();
if (state == State.INVENTORY || state == State.ANY) result.addAll(inventory.findPaths(url, depth, ext));
if (state == State.ARCHIVE || state == State.ANY) result.addAll(archive.findPaths(url, depth, ext));
return result;
}
}
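
As a quick orientation, a minimal usage sketch of the new Transactions API defined above; this is not part of the commit, and the HTCACHE path and the example URL are assumptions for illustration:

import java.io.File;
import java.util.Collection;
import java.util.Date;
import java.util.Map;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.crawler.data.Snapshots;
import net.yacy.crawler.data.Transactions;

public class TransactionsUsageSketch {
    public static void main(String[] args) throws Exception {
        // assumption: the usual YaCy data layout; Switchboard normally calls init with HTCACHE/snapshots
        Transactions.init(new File("DATA/HTCACHE/snapshots")); // creates inventory/ and archive/ below it
        // list known snapshots for one host, newest first, across both transaction states
        Map<String, Date> known = Transactions.select("example.org", null, Snapshots.Order.LATESTFIRST, 10, Transactions.State.ANY);
        System.out.println(known);
        // locate the stored xml metadata for a single (hypothetical) document
        DigestURL url = new DigestURL("http://example.org/index.html");
        Collection<File> xmlFiles = Transactions.findPaths(url, "xml", Transactions.State.ANY);
        System.out.println(xmlFiles);
    }
}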

@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
crawlingQ,
true, true, true, false,
true, true, false,
-1, true,
-1, false, true,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard

@ -31,23 +31,15 @@
package net.yacy.document.parser.html;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.Properties;
import java.util.Stack;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.io.CharBuffer;

@ -42,7 +42,6 @@ import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
@ -61,7 +60,6 @@ import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.htmlParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
@ -206,22 +204,6 @@ public final class LoaderDispatcher {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
// before we return pages from the cache, check if we are requested to produce snapshots which will be generated newly every time
if (protocol.equals("http") || protocol.equals("https")) {
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
boolean depthok = crawlProfile != null && request.depth() <= crawlProfile.snapshotMaxdepth();
String file = request.url().getFile();
String ext = MultiProtocolURL.getFileExtension(file).toLowerCase();
boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext);
if (depthok && extok) {
File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, agent.userAgent);
log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));
} else {
//if (!depthok) log.warn("SNAPSHOT: depth not ok, " + (crawlProfile == null ? "profile = null" : "entry.depth() = " + request.depth() + ", profile.snapshotMaxdepth() = " + crawlProfile.snapshotMaxdepth()));
//if (!extok) log.warn("SNAPSHOT: ext not ok, entry.url().getFile() = " + request.url().getFile());
}
}
// check if we have the page in the cache
if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {

@ -122,12 +122,12 @@ import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.Snapshots;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultImages;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.data.Transactions;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxt;
@ -244,7 +244,6 @@ public final class Switchboard extends serverSwitch {
public File queuesRoot;
public File surrogatesInPath;
//public File surrogatesOutPath;
public Snapshots snapshots;
public Segment index;
public LoaderDispatcher loader;
public CrawlSwitchboard crawler;
@ -698,9 +697,8 @@ public final class Switchboard extends serverSwitch {
final long maxCacheSize =
1024L * 1024L * Long.parseLong(getConfig(SwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte
Cache.init(this.htCachePath, this.peers.mySeed().hash, maxCacheSize);
final File snapshotdir = new File(this.htCachePath, "SNAPSHOTS");
snapshotdir.mkdirs(); // create directory if missing
this.snapshots = new Snapshots(snapshotdir);
final File transactiondir = new File(this.htCachePath, "snapshots");
Transactions.init(transactiondir);
// create the surrogates directories
this.surrogatesInPath =
@ -2848,12 +2846,14 @@ public final class Switchboard extends serverSwitch {
url,
referrerURL,
collections,
profile,
queueEntry.getResponseHeader(),
document,
condenser,
searchEvent,
sourceName,
getConfigBool(SwitchboardConstants.DHT_ENABLED, false));
getConfigBool(SwitchboardConstants.DHT_ENABLED, false),
sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
final RSSFeed feed =
EventChannel.channels(queueEntry.initiator() == null
? EventChannel.PROXY

@ -165,11 +165,13 @@ public class DocumentIndex extends Segment {
null,
null,
null,
null,
document,
condenser,
null,
DocumentIndex.class.getName() + ".add",
false);
false,
null);
}
return rows;
}

@ -59,10 +59,13 @@ import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.LookAheadIterator;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Transactions;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.htmlParser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.word.Word;
@ -537,12 +540,14 @@ public class Segment {
final DigestURL url,
final DigestURL referrerURL,
final Map<String, Pattern> collections,
final CrawlProfile crawlProfile,
final ResponseHeader responseHeader,
final Document document,
final Condenser condenser,
final SearchEvent searchEvent,
final String sourceName, // contains the crawl profile hash if this comes from a web crawl
final boolean storeToRWI
final boolean storeToRWI,
final String proxy
) {
final long startTime = System.currentTimeMillis();
@ -567,10 +572,21 @@ public class Segment {
// ENRICH DOCUMENT WITH RANKING INFORMATION
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
// CREATE SNAPSHOT
if ((url.getProtocol().equals("http") || url.getProtocol().equals("https")) &&
crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth()) {
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
// STORE IMAGE AND METADATA
Transactions.store(vector, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent());
}
}
// STORE TO SOLR
String error = null;
this.putDocument(vector);
List<SolrInputDocument> webgraph = vector.getWebgraphDocuments();
String error = null;
if (webgraph != null && webgraph.size() > 0) {
// write the edges to the webgraph solr index
