/**
* Transactions
* Copyright 2014 by Michael Peter Christen
* First released 08.12.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.data;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.Snapshots.Order;
import net.yacy.crawler.data.Snapshots.Revisions;
import net.yacy.search.schema.CollectionSchema;
/**
* This is a static class holding one or several Snapshot directories.
* Transacted snapshots are moved from the inventory snapshot directory to the archive snapshot directory.
*
*/
public class Transactions {
private final static String XML_PREFIX = "\n\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata)
osw.write("\n");
EnhancedXMLResponseWriter.writeDoc(osw, doc);
osw.write("\n");
osw.write("\n");
osw.close();
fos.close();
Transactions.announceStorage(url, depth, date, State.INVENTORY);
}
} catch (IOException e) {
ConcurrentLog.logException(e);
success = false;
}
}
return success;
}
/**
 * Store a PDF snapshot of the given url in the inventory.
 * The target directory is created if necessary; the rendering itself is delegated to
 * Html2Image.writeWkhtmltopdf.
 * @param url the url of the document to snapshot
 * @param date the date used to compute the snapshot path
 * @param depth the depth used to compute the snapshot path
 * @param concurrency if true, the rendering is handed to the executor, provided that fewer
 *        rendering tasks than available processors are currently running; otherwise it is done
 *        in the calling thread
 * @param replaceOld if true, all inventory snapshot files found for the url and depth are deleted first
 * @param proxy passed through to the renderer
 * @param acceptLanguage passed through to the renderer
 * @return the result of the renderer when rendering in the calling thread; always true when the
 *         rendering was handed to the executor (its outcome is not known at return time)
 */
public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final String acceptLanguage) {
    // CLEAN UP OLD DATA (if wanted); the lookup is only done when something shall be replaced
    if (replaceOld) {
        final Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY);
        if (oldPaths != null) {
            for (final File oldPath : oldPaths) oldPath.delete();
        }
    }

    // make sure that the snapshot directory exists (derived from the metadata path)
    final File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
    metadataPath.getParentFile().mkdirs();

    // STORE AN IMAGE
    final String urls = url.toNormalform(true);
    final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY);
    if (concurrency && executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) {
        // a plain Runnable is sufficient: the executor supplies the thread
        executor.execute(new Runnable() {
            @Override
            public void run() {
                executorRunning.incrementAndGet();
                try {
                    Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
                } catch (final Throwable e) {
                    // best effort: the caller has already returned, but the failure must not vanish silently
                    ConcurrentLog.logException(e);
                } finally {
                    executorRunning.decrementAndGet();
                }
            }
        });
        return true;
    }
    return Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
}
/**
 * Commit a snapshot: all data stored for the given urlhash is transferred from the
 * inventory to the archive. The index within the snapshot management is updated as well.
 * @param urlhash the url hash of the snapshot to commit
 * @return the revision object of the moved document if the commit has succeeded, null if something went wrong
 */
public static Revisions commit(String urlhash) {
    final Revisions committed = transact(urlhash, State.INVENTORY, State.ARCHIVE);
    return committed;
}
/**
 * Roll back a snapshot: all data stored for the given urlhash is transferred from the
 * archive back to the inventory. The index within the snapshot management is updated as well.
 * @param urlhash the url hash of the snapshot to roll back
 * @return the revision object of the moved document if the rollback has succeeded, null if something went wrong
 */
public static Revisions rollback(String urlhash) {
    final Revisions restored = transact(urlhash, State.ARCHIVE, State.INVENTORY);
    return restored;
}
/**
 * Move all snapshot files of a url hash from one transaction state to the other and
 * announce the movement to the snapshot-internal index.
 * For every metadata file of the revision, all files in the same directory that share its
 * name stub are renamed into the corresponding directory of the target state; source
 * directories which become empty are removed afterwards.
 * @param urlhash the url hash of the snapshot to move
 * @param from the state the snapshot currently has
 * @param to the state the snapshot shall get
 * @return the revisions found in the source state, or null if there are none or if the
 *         url of the revision is malformed
 */
private static Revisions transact(final String urlhash, final State from, final State to) {
    final Revisions r = Transactions.getRevisions(from, urlhash);
    if (r == null) return null;

    // getAbsolutePath() yields a platform-specific path, so the state directory must be
    // matched with the platform separator and not with a hard-coded "/"
    final String fromSegment = File.separator + from.dirname + File.separator;
    final String toSegment = File.separator + to.dirname + File.separator;

    // we take all pathtoxml and move that to the target state
    for (final File f : r.pathtoxml) {
        final String name = f.getName();
        // strip the last four characters (presumably the ".xml" extension)
        final String nameStub = name.substring(0, name.length() - 4);
        final File sourceParent = f.getParentFile();
        final File targetParent = new File(sourceParent.getAbsolutePath().replace(fromSegment, toSegment));
        targetParent.mkdirs();

        // list all files in the parent directory; list() returns null if the directory
        // does not exist (any more) or cannot be read
        final String[] siblings = sourceParent.list();
        if (siblings == null) continue;
        for (final String a : siblings) {
            if (!a.startsWith(nameStub)) continue;
            final File source = new File(sourceParent, a);
            final File target = new File(targetParent, a);
            if (!source.renameTo(target)) {
                ConcurrentLog.logException(new IOException("cannot move snapshot file " + source + " to " + target));
            }
        }

        // delete empty directories, walking upwards until a non-empty or unreadable one is found
        File dir = sourceParent;
        while (dir != null) {
            final String[] remaining = dir.list();
            if (remaining == null || remaining.length != 0) break;
            dir.delete();
            dir = dir.getParentFile();
        }
    }

    // announce the movement; only the first date of the revision is announced for the target state
    try {
        final DigestURL durl = new DigestURL(r.url);
        Transactions.announceDeletion(durl, r.depth, from);
        Transactions.announceStorage(durl, r.depth, r.dates[0], to);
        return r;
    } catch (final MalformedURLException e) {
        ConcurrentLog.logException(e);
        return null;
    }
}
/**
 * Select entries from the snapshot directories. The selection is either ordered by
 * generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or has any order.
 * The result can be restricted to a given host and/or a given depth.
 * Note that maxcount is passed unchanged to each queried snapshot directory, so with
 * State.ANY the limit applies to inventory and archive separately.
 * @param host selected host or null for all hosts
 * @param depth selected depth or null for all depths
 * @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
 * @param maxcount the maximum number of entries per queried directory. If unlimited, submit Integer.MAX_VALUE
 * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
 * @return the combined selection; inventory entries are inserted before archive entries
 */
public static LinkedHashMap select(String host, Integer depth, final Order order, int maxcount, State state) {
    final boolean anyState = state == State.ANY;
    LinkedHashMap selection = new LinkedHashMap<>();
    if (anyState || state == State.INVENTORY) {
        selection.putAll(inventory.select(host, depth, order, maxcount));
    }
    if (anyState || state == State.ARCHIVE) {
        selection.putAll(archive.select(host, depth, order, maxcount));
    }
    return selection;
}
/**
 * Compute the path of a snapshot. Nothing is created by this method: neither the
 * snapshot itself nor the directories leading to its storage location.
 * @param url the url of the snapshot
 * @param depth the depth of the snapshot
 * @param date the date of the snapshot
 * @param ext the file extension of the snapshot file
 * @param state the wanted transaction state, State.INVENTORY or State.ARCHIVE; State.ANY is rejected
 * @return a file to the snapshot, or null if the state is none of the known states
 * @throws RuntimeException if state is State.ANY
 */
public static File definePath(final DigestURL url, final int depth, final Date date, final String ext, State state) {
    if (state == State.ANY) {
        throw new RuntimeException("definePath must be selected with INVENTORY or ARCHIVE state");
    }
    final File path;
    if (state == State.INVENTORY) {
        path = inventory.definePath(url, depth, date, ext);
    } else if (state == State.ARCHIVE) {
        path = archive.definePath(url, depth, date, ext);
    } else {
        path = null;
    }
    return path;
}
/**
 * Write information about the storage of a snapshot to the Snapshot-internal index.
 * The actual writing of files to the target directory must be done elsewhere; this method
 * does not store the snapshot files. With State.ANY both inventory and archive are notified.
 * @param url the url of the stored snapshot
 * @param depth the depth of the stored snapshot
 * @param date the date of the stored snapshot
 * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
 */
public static void announceStorage(final DigestURL url, final int depth, final Date date, State state) {
    final boolean anyState = state == State.ANY;
    if (anyState || state == State.INVENTORY) {
        inventory.announceStorage(url, depth, date);
    }
    if (anyState || state == State.ARCHIVE) {
        archive.announceStorage(url, depth, date);
    }
}
/**
 * Remove information about the storage of a snapshot from the Snapshot-internal index.
 * The actual deletion of files in the target directory must be done elsewhere; this method
 * only forwards the announcement. With State.ANY both inventory and archive are notified.
 * @param url the url of the deleted snapshot
 * @param depth the depth of the deleted snapshot
 * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
 */
public static void announceDeletion(final DigestURL url, final int depth, final State state) {
    final boolean anyState = state == State.ANY;
    if (anyState || state == State.INVENTORY) {
        inventory.announceDeletion(url, depth);
    }
    if (anyState || state == State.ARCHIVE) {
        archive.announceDeletion(url, depth);
    }
}
/**
 * For a given url, get all paths for storage locations.
 * The locations are all for the single url but may represent different storage times.
 * According to the original note this variant is inefficient because it tests all different
 * depths; prefer the overload which takes a depth when the depth is known.
 * @param url the url of the wanted snapshots
 * @param ext required extension or null if the extension must not be checked
 * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
 * @return a collection of files for snapshots of the url; never null, possibly empty
 */
public static Collection<File> findPaths(final DigestURL url, final String ext, State state) {
    final Collection<File> result = new ArrayList<>();
    if (state == State.INVENTORY || state == State.ANY) {
        final Collection<? extends File> found = inventory.findPaths(url, ext);
        // addAll(null) would throw, so a missing result is treated as "nothing found"
        if (found != null) result.addAll(found);
    }
    if (state == State.ARCHIVE || state == State.ANY) {
        final Collection<? extends File> found = archive.findPaths(url, ext);
        if (found != null) result.addAll(found);
    }
    return result;
}
/**
 * For a given url and depth, get all paths for storage locations.
 * The locations are all for the single url but may represent different storage times.
 * @param url the url of the wanted snapshots
 * @param depth the depth of the wanted snapshots
 * @param ext required extension or null if the extension must not be checked
 * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY
 * @return a collection of files for snapshots of the url; never null, possibly empty
 */
public static Collection<File> findPaths(final DigestURL url, final int depth, final String ext, State state) {
    final Collection<File> result = new ArrayList<>();
    if (state == State.INVENTORY || state == State.ANY) {
        final Collection<? extends File> found = inventory.findPaths(url, depth, ext);
        // addAll(null) would throw, so a missing result is treated as "nothing found"
        if (found != null) result.addAll(found);
    }
    if (state == State.ARCHIVE || state == State.ANY) {
        final Collection<? extends File> found = archive.findPaths(url, depth, ext);
        if (found != null) result.addAll(found);
    }
    return result;
}
}