yacy_search_server/source/net/yacy/crawler/data/Transactions.java

/**
 *  Transactions
 *  Copyright 2014 by Michael Peter Christen
 *  First released 08.12.2014 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.crawler.data;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.solr.common.SolrInputDocument;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.Snapshots.Order;
import net.yacy.crawler.data.Snapshots.Revisions;
import net.yacy.search.schema.CollectionSchema;

/**
 * This is a static class holding one or several Snapshot directories
 * Transacted snapshots are moved from the inventory snapshot directory to the archive snapshot directory.
 *
 */
public class Transactions {
    
    private final static String XML_PREFIX = "<response>\n<!--\n";
    private final static char[] WHITESPACE = new char[132];
    //private final static int WHITESPACE_START = XML_PREFIX.length();
    //private final static int WHITESPACE_LENGTH = WHITESPACE.length;
    private static File transactionDir = null, inventoryDir = null, archiveDir = null;
    private static Snapshots inventory = null, archive = null;
    private static ExecutorService executor = Executors.newCachedThreadPool();
    private static AtomicInteger executorRunning = new AtomicInteger(0);
    
    static {
        for (int i = 0; i < WHITESPACE.length; i++) WHITESPACE[i] = 32;
    }
    
    public static enum State {
        INVENTORY("inventory"), ARCHIVE("archive"), ANY(null);
        public String dirname;
        State(String dirname) {
            this.dirname = dirname;
        }
    }
    
    public static void init(File dir) {
        transactionDir = dir;
        transactionDir.mkdirs();
        inventoryDir = new File(transactionDir, State.INVENTORY.dirname);
        inventory = new Snapshots(inventoryDir);
        archiveDir = new File(transactionDir, State.ARCHIVE.dirname);
        archive = new Snapshots(archiveDir);
    }

    /**
     * get the number of entries for each of the transaction states
     * @return the total number of different documents for each transaction state
     */
    public static Map<String, Integer> sizes() {
        HashMap<String, Integer> m = new HashMap<>();
        m.put(State.INVENTORY.name(), inventory.size());
        m.put(State.ARCHIVE.name(), archive.size());
        return m;
    }

    public static Revisions getRevisions(final State state, final String urlhash) {
        switch (state) {
            case INVENTORY : return inventory.getRevisions(urlhash);
            case ARCHIVE : return archive.getRevisions(urlhash);
            default : Revisions a = inventory.getRevisions(urlhash); return a == null ? archive.getRevisions(urlhash) : a;
        }
    }
    
    /**
     * get a list of <host>.<port> names in the snapshot directory
     * @return the list of the given state. if the state is ALL or unknown, all lists are combined
     */
    public static Set<String> listHosts(final State state) {
        switch (state) {
            case INVENTORY : return inventory.listHosts();
            case ARCHIVE : return archive.listHosts();
            default : Set<String> a = inventory.listHosts(); a.addAll(archive.listHosts()); return a;
        }
    }
    
    /**
     * list the snapshots for a given host name
     * @param hostport the <host>.<port> identifier for the domain
     * @param depth restrict the result to the given depth or if depth == -1 do not restrict to a depth
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a map with a set for each depth in the domain of the host name
     */
    public static TreeMap<Integer, Collection<Revisions>> listIDs(final String hostport, final int depth, final State state) {
        switch (state) {
            case INVENTORY : return inventory.listIDs(hostport, depth);
            case ARCHIVE : return archive.listIDs(hostport, depth);
            default : TreeMap<Integer, Collection<Revisions>> a = inventory.listIDs(hostport, depth); a.putAll(archive.listIDs(hostport, depth)); return a;
        }
    }
    
    /**
     * get the number of snapshots for the given host name
     * @param hostport the <host>.<port> identifier for the domain
     * @param depth restrict the result to the given depth or if depth == -1 do not restrict to a depth
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a count, the total number of documents for the domain and depth
     */
    public static int listIDsSize(final String hostport, final int depth, final State state) {
        switch (state) {
            case INVENTORY : return inventory.listIDsSize(hostport, depth);
            case ARCHIVE : return archive.listIDsSize(hostport, depth);
            default : return inventory.listIDsSize(hostport, depth) + archive.listIDsSize(hostport, depth);
        }
    }
    
    public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final String acceptLanguage) {

        // GET METADATA FROM DOC
        final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
        final Date date = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
        final int depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
        DigestURL url;
        try {
            url = new DigestURL(urls);
        } catch (MalformedURLException e) {
            ConcurrentLog.logException(e);
            return false;
        }
        
        boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, acceptLanguage) : true;
        if (success) {
            // STORE METADATA FOR THE IMAGE
            File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
            metadataPath.getParentFile().mkdirs();
            try {
                if (doc != null) {
                    FileOutputStream fos = new FileOutputStream(metadataPath);
                    OutputStreamWriter osw = new OutputStreamWriter(fos);
                    osw.write(XML_PREFIX);
                    osw.write(WHITESPACE); osw.write("\n-->\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata)
                    osw.write("<result name=\"response\" numFound=\"1\" start=\"0\">\n");
                    EnhancedXMLResponseWriter.writeDoc(osw, doc);
                    osw.write("</result>\n");
                    osw.write("</response>\n");
                    osw.close();
                    fos.close();
                    Transactions.announceStorage(url, depth, date, State.INVENTORY);
                }
            } catch (IOException e) {
                ConcurrentLog.logException(e);
                success = false;
            }
        }
        
        return success;
    }
    

    public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final String acceptLanguage) {

        // CLEAN UP OLD DATA (if wanted)
        Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY);
        if (replaceOld && oldPaths != null) {
            for (File oldPath: oldPaths) oldPath.delete();
        }
        
        // STORE METADATA FOR THE IMAGE
        File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
        metadataPath.getParentFile().mkdirs();
        boolean success = true;
        
        // STORE AN IMAGE
        final String urls = url.toNormalform(true);
        final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY);
        if (concurrency && executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) {
            Thread t = new Thread(){
                @Override
                public void run() {
                    executorRunning.incrementAndGet();
                    try {
                        Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
                    } catch (Throwable e) {} finally {
                    executorRunning.decrementAndGet();
                    }
                }
            };
            executor.execute(t);
        } else {
            success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
        }
        
        return success;
    }

    /**
     * Announce the commit of a snapshot: this will move all data for the given urlhash from the inventory to the archive
     * The index within the snapshot management will update also.
     * @param urlhash
     * @return a revision object from the moved document if the commit has succeeded, null if something went wrong
     */
    public static Revisions commit(String urlhash) {
        return transact(urlhash, State.INVENTORY, State.ARCHIVE);
    }

    /**
     * Announce the rollback of a snapshot: this will move all data for the given urlhash from the archive to the inventory
     * The index within the snapshot management will update also.
     * @param urlhash
     * @return a revision object from the moved document if the commit has succeeded, null if something went wrong
     */
    public static Revisions rollback(String urlhash) {
        return transact(urlhash, State.ARCHIVE, State.INVENTORY);
    }
    
    private static Revisions transact(final String urlhash, final State from, final State to) {
        Revisions r = Transactions.getRevisions(from, urlhash);
        if (r == null) return null;
        // we take all pathtoxml and move that to archive
        for (File f: r.pathtoxml) {
            String name = f.getName();
            String nameStub = name.substring(0, name.length() - 4);
            File sourceParent = f.getParentFile();
            File targetParent = new File(sourceParent.getAbsolutePath().replace("/" + from.dirname + "/", "/" + to.dirname + "/"));
            targetParent.mkdirs();
            // list all files in the parent directory
            for (String a: sourceParent.list()) {
                if (a.startsWith(nameStub)) {
                    new File(sourceParent, a).renameTo(new File(targetParent, a));
                }
            }
            // delete empty directories
            while (sourceParent.list().length == 0) {
                sourceParent.delete();
                sourceParent = sourceParent.getParentFile();
            }
        }
        // announce the movement
        DigestURL durl;
        try {
            durl = new DigestURL(r.url);
            Transactions.announceDeletion(durl, r.depth, from);
            Transactions.announceStorage(durl, r.depth, r.dates[0], to);
            return r;
        } catch (MalformedURLException e) {
            ConcurrentLog.logException(e);
        }
    
        return null;
    }
    
    /**
     * select a set of urlhashes from the snapshot directory. The selection either ordered
     * by generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or with any
     * order. The result set can be selected either with a given host or a depth
     * @param host selected host or null for all hosts
     * @param depth selected depth or null for all depths
     * @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
     * @param maxcount the maximum number of hosthashes. If unlimited, submit Integer.MAX_VALUE
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a map of hosthashes with the associated creation date
     */
    public static LinkedHashMap<String, Revisions> select(String host, Integer depth, final Order order, int maxcount, State state) {
        LinkedHashMap<String, Revisions> result = new LinkedHashMap<>();
        if (state == State.INVENTORY || state == State.ANY) result.putAll(inventory.select(host, depth, order, maxcount));
        if (state == State.ARCHIVE || state == State.ANY) result.putAll(archive.select(host, depth, order, maxcount));
        return result;
    }

    /**
     * Compute the path of a snapshot. This does not create the snapshot, only gives a path.
     * Also, the path to the storage location is not created.
     * @param url
     * @param depth
     * @param date
     * @param ext
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a file to the snapshot
     */
    public static File definePath(final DigestURL url, final int depth, final Date date, final String ext, State state) {
        if (state == State.ANY) throw new RuntimeException("definePath must be selected with INVENTORY or ARCHIVE state");
        if (state == State.INVENTORY) return inventory.definePath(url, depth, date, ext);
        if (state == State.ARCHIVE) return archive.definePath(url, depth, date, ext);
        return null;
    }

    /**
     * Write information about the storage of a snapshot to the Snapshot-internal index.
     * The actual writing of files to the target directory must be done elsewehre, this method does not store the snapshot files.
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @param url
     * @param depth
     * @param date
     */
    public static void announceStorage(final DigestURL url, final int depth, final Date date, State state) {
        if (state == State.INVENTORY || state == State.ANY) inventory.announceStorage(url, depth, date); 
        if (state == State.ARCHIVE || state == State.ANY) archive.announceStorage(url, depth, date);
    }

    /**
     * Delete information about the storage of a snapshot to the Snapshot-internal index.
     * The actual deletion of files in the target directory must be done elsewehre, this method does not store the snapshot files.
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @param url
     * @param depth
     * @param date
     */
    public static void announceDeletion(final DigestURL url, final int depth, final State state) {
        if (state == State.INVENTORY || state == State.ANY) inventory.announceDeletion(url, depth); 
        if (state == State.ARCHIVE || state == State.ANY) archive.announceDeletion(url, depth);
    }

    /**
     * for a given url, get all paths for storage locations.
     * The locations are all for the single url but may represent different storage times.
     * This method is inefficient because it tests all different depths, it would be better to use
     * findPaths/3 with a given depth.
     * @param url
     * @param ext required extension or null if the extension must not be checked
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a set of files for snapshots of the url
     */
    public static Collection<File> findPaths(final DigestURL url, final String ext, State state) {
        Collection<File> result = new ArrayList<>();
        if (state == State.INVENTORY || state == State.ANY) result.addAll(inventory.findPaths(url, ext));
        if (state == State.ARCHIVE || state == State.ANY) result.addAll(archive.findPaths(url, ext));
        return result;
    }
    
    /**
     * for a given url, get all paths for storage locations.
     * The locations are all for the single url but may represent different storage times.
     * @param url
     * @param ext required extension or null if the extension must not be checked
     * @param depth
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a set of files for snapshots of the url
     */
    public static Collection<File> findPaths(final DigestURL url, final int depth, final String ext, State state) {
        Collection<File> result = new ArrayList<>();
        if (state == State.INVENTORY || state == State.ANY) result.addAll(inventory.findPaths(url, depth, ext));
        if (state == State.ARCHIVE || state == State.ANY) result.addAll(archive.findPaths(url, depth, ext));
        return result;
    }
    
}