yacy_search_server/source/net/yacy/crawler/data/Transactions.java

/**
 *  Transactions
 *  Copyright 2014 by Michael Peter Christen
 *  First released 08.12.2014 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.crawler.data;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.solr.common.SolrInputDocument;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.Snapshots.Order;
import net.yacy.search.schema.CollectionSchema;

/**
 * This is a static class holding one or several Snapshot directories
 * Transacted snapshots are moved from the inventory snapshot directory to the archive snapshot directory.
 *
 */
public class Transactions {
    
    private final static String XML_PREFIX = "<response>\n<!--\n";
    private final static char[] WHITESPACE = new char[132];
    private final static int WHITESPACE_START = XML_PREFIX.length();
    private final static int WHITESPACE_LENGTH = WHITESPACE.length;
    private final static String SNAPSHOT_INVENTORY_DIR = "inventory";
    private final static String SNAPSHOT_ARCHIVE_DIR = "archive";
    private static File transactionDir = null, inventoryDir = null, archiveDir = null;
    private static Snapshots inventory = null, archive = null;
    
    static {
        for (int i = 0; i < WHITESPACE.length; i++) WHITESPACE[i] = 32;
    }
    
    public static enum State {
        INVENTORY, ARCHIVE, ANY;
    }
    
    public static void init(File dir) {
        transactionDir = dir;
        transactionDir.mkdirs();
        inventoryDir = new File(transactionDir, SNAPSHOT_INVENTORY_DIR);
        inventory = new Snapshots(inventoryDir);
        archiveDir = new File(transactionDir, SNAPSHOT_ARCHIVE_DIR);
        archive = new Snapshots(archiveDir);
    }

    public static boolean store(SolrInputDocument doc, boolean loadImage, boolean replaceOld, String proxy, ClientIdentification.Agent agent) {

        // GET METADATA FROM DOC
        String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
        Date date = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
        int depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
        DigestURL url;
        try {
            url = new DigestURL(urls);
        } catch (MalformedURLException e) {
            ConcurrentLog.logException(e);
            return false;
        }
        
        // STORE AN IMAGE
        Collection<File> oldPaths = Transactions.findPaths(url, depth, "pdf", Transactions.State.INVENTORY);
        if (replaceOld) {
            for (File oldPath: oldPaths) oldPath.delete();
        }
        File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
        metadataPath.getParentFile().mkdirs();
        boolean success = true;
        if (loadImage) {
            File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY);
            success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath);
        }
        
        // STORE METADATA FOR THE IMAGE
        if (success) try {
            if (doc != null) {
                FileOutputStream fos = new FileOutputStream(metadataPath);
                OutputStreamWriter osw = new OutputStreamWriter(fos);
                osw.write(XML_PREFIX);
                osw.write(WHITESPACE); osw.write("\n-->\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata)
                osw.write("<result name=\"response\" numFound=\"1\" start=\"0\">\n");
                EnhancedXMLResponseWriter.writeDoc(osw, doc);
                osw.write("</result>\n");
                osw.write("</response>\n");
                osw.close();
                fos.close();
                Transactions.announceStorage(url, depth, date);
            }
            return true;
        } catch (IOException e) {
            ConcurrentLog.logException(e);
            return false;
        }
        return false;
    }
    
    
    /**
     * select a set of urlhashes from the snapshot directory. The selection either ordered
     * by generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or with any
     * order. The result set can be selected either with a given host or a depth
     * @param host selected host or null for all hosts
     * @param depth selected depth or null for all depths
     * @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
     * @param maxcount the maximum number of hosthashes. If unlimited, submit Integer.MAX_VALUE
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a map of hosthashes with the associated creation date
     */
    public static Map<String, Date> select(String host, Integer depth, final Order order, int maxcount, State state) {
        Map<String, Date> result = new HashMap<>();
        if (state == State.INVENTORY || state == State.ANY) result.putAll(inventory.select(host, depth, order, maxcount));
        if (state == State.ARCHIVE || state == State.ANY) result.putAll(archive.select(host, depth, order, maxcount));
        return result;
    }

    /**
     * Compute the path of a snapshot. This does not create the snapshot, only gives a path.
     * Also, the path to the storage location is not created.
     * @param url
     * @param depth
     * @param date
     * @param ext
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a file to the snapshot
     */
    public static File definePath(final DigestURL url, final int depth, final Date date, final String ext, State state) {
        if (state == State.ANY) throw new RuntimeException("definePath must be selected with INVENTORY or ARCHIVE state");
        if (state == State.INVENTORY) return inventory.definePath(url, depth, date, ext);
        if (state == State.ARCHIVE) return archive.definePath(url, depth, date, ext);
        return null;
    }
    
    public static void announceStorage(final DigestURL url, final int depth, final Date date) {
        inventory.announceStorage(url, depth, date); 
    }

    /**
     * for a given url, get all paths for storage locations.
     * The locations are all for the single url but may represent different storage times.
     * This method is inefficient because it tests all different depths, it would be better to use
     * findPaths/3 with a given depth.
     * @param url
     * @param ext
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a set of files for snapshots of the url
     */
    public static Collection<File> findPaths(final DigestURL url, final String ext, State state) {
        Collection<File> result = new ArrayList<>();
        if (state == State.INVENTORY || state == State.ANY) result.addAll(inventory.findPaths(url, ext));
        if (state == State.ARCHIVE || state == State.ANY) result.addAll(archive.findPaths(url, ext));
        return result;
    }
    
    /**
     * for a given url, get all paths for storage locations.
     * The locations are all for the single url but may represent different storage times.
     * @param url
     * @param ext
     * @param depth
     * @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY 
     * @return a set of files for snapshots of the url
     */
    public static Collection<File> findPaths(final DigestURL url, final int depth, final String ext, State state) {
        Collection<File> result = new ArrayList<>();
        if (state == State.INVENTORY || state == State.ANY) result.addAll(inventory.findPaths(url, depth, ext));
        if (state == State.ARCHIVE || state == State.ANY) result.addAll(archive.findPaths(url, depth, ext));
        return result;
    }
    
}
enhanced the snapshot functionality: - snapshots can now also be xml files which are extracted from the solr index and stored as individual xml files in the snapshot directory along the pdf and jpg images - a transaction layer was placed above of the snapshot directory to distinguish snapshots into 'inventory' and 'archive'. This may be used to do transactions of index fragments using archived solr search results between peers. This is currently unfinished, we need a protocol to move snapshots from inventory to archive - the SNAPSHOT directory was renamed to snapshot and contains now two snapshot subdirectories: inventory and archive - snapshots may now be generated by everyone, not only such peers running on a server with xkhtml2pdf installed. The expert crawl starts provides the option for snapshots to everyone. PDF snapshots are now optional and the option is only shown if xkhtml2pdf is installed. - the snapshot api now provides the request for historised xml files, i.e. call: http://localhost:8090/api/snapshot.xml?urlhash=Q3dQopFh1hyQ The result of such xml files is identical with solr search results with only one hit. The pdf generation has been moved from the http loading process to the solr document storage process. This may slow down the process a lot and a different version of the process may be needed. 10 years ago			`/**`
			`* Transactions`
			`* Copyright 2014 by Michael Peter Christen`
			`* First released 08.12.2014 at http://yacy.net`
			`*`
			`* This library is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* This library is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public License`
			`* along with this program in the file lgpl21.txt`
			`* If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`package net.yacy.crawler.data;`

			`import java.io.File;`
			`import java.io.FileOutputStream;`
			`import java.io.IOException;`
			`import java.io.OutputStreamWriter;`
			`import java.net.MalformedURLException;`
			`import java.util.ArrayList;`
			`import java.util.Collection;`
			`import java.util.Date;`
			`import java.util.HashMap;`
			`import java.util.Map;`

			`import org.apache.solr.common.SolrInputDocument;`

			`import net.yacy.cora.document.id.DigestURL;`
			`import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter;`
			`import net.yacy.cora.protocol.ClientIdentification;`
			`import net.yacy.cora.util.ConcurrentLog;`
			`import net.yacy.cora.util.Html2Image;`
			`import net.yacy.crawler.data.Snapshots.Order;`
			`import net.yacy.search.schema.CollectionSchema;`

			`/**`
			`* This is a static class holding one or several Snapshot directories`
			`* Transacted snapshots are moved from the inventory snapshot directory to the archive snapshot directory.`
			`*`
			`*/`
			`public class Transactions {`

			`private final static String XML_PREFIX = "<response>\n<!--\n";`
			`private final static char[] WHITESPACE = new char[132];`
			`private final static int WHITESPACE_START = XML_PREFIX.length();`
			`private final static int WHITESPACE_LENGTH = WHITESPACE.length;`
			`private final static String SNAPSHOT_INVENTORY_DIR = "inventory";`
			`private final static String SNAPSHOT_ARCHIVE_DIR = "archive";`
			`private static File transactionDir = null, inventoryDir = null, archiveDir = null;`
			`private static Snapshots inventory = null, archive = null;`

			`static {`
			`for (int i = 0; i < WHITESPACE.length; i++) WHITESPACE[i] = 32;`
			`}`

			`public static enum State {`
			`INVENTORY, ARCHIVE, ANY;`
			`}`

			`public static void init(File dir) {`
			`transactionDir = dir;`
			`transactionDir.mkdirs();`
			`inventoryDir = new File(transactionDir, SNAPSHOT_INVENTORY_DIR);`
			`inventory = new Snapshots(inventoryDir);`
			`archiveDir = new File(transactionDir, SNAPSHOT_ARCHIVE_DIR);`
			`archive = new Snapshots(archiveDir);`
			`}`

			`public static boolean store(SolrInputDocument doc, boolean loadImage, boolean replaceOld, String proxy, ClientIdentification.Agent agent) {`

			`// GET METADATA FROM DOC`
			`String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());`
			`Date date = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());`
			`int depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());`
			`DigestURL url;`
			`try {`
			`url = new DigestURL(urls);`
			`} catch (MalformedURLException e) {`
			`ConcurrentLog.logException(e);`
			`return false;`
			`}`

			`// STORE AN IMAGE`
			`Collection<File> oldPaths = Transactions.findPaths(url, depth, "pdf", Transactions.State.INVENTORY);`
			`if (replaceOld) {`
			`for (File oldPath: oldPaths) oldPath.delete();`
			`}`
			`File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);`
			`metadataPath.getParentFile().mkdirs();`
			`boolean success = true;`
			`if (loadImage) {`
			`File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY);`
			`success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, pdfPath);`
			`}`

			`// STORE METADATA FOR THE IMAGE`
			`if (success) try {`
			`if (doc != null) {`
			`FileOutputStream fos = new FileOutputStream(metadataPath);`
			`OutputStreamWriter osw = new OutputStreamWriter(fos);`
			`osw.write(XML_PREFIX);`
			`osw.write(WHITESPACE); osw.write("\n-->\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata)`
			`osw.write("<result name=\"response\" numFound=\"1\" start=\"0\">\n");`
			`EnhancedXMLResponseWriter.writeDoc(osw, doc);`
			`osw.write("</result>\n");`
			`osw.write("</response>\n");`
			`osw.close();`
			`fos.close();`
			`Transactions.announceStorage(url, depth, date);`
			`}`
			`return true;`
			`} catch (IOException e) {`
			`ConcurrentLog.logException(e);`
			`return false;`
			`}`
			`return false;`
			`}`


			`/**`
			`* select a set of urlhashes from the snapshot directory. The selection either ordered`
			`* by generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or with any`
			`* order. The result set can be selected either with a given host or a depth`
			`* @param host selected host or null for all hosts`
			`* @param depth selected depth or null for all depths`
			`* @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST`
			`* @param maxcount the maximum number of hosthashes. If unlimited, submit Integer.MAX_VALUE`
			`* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY`
			`* @return a map of hosthashes with the associated creation date`
			`*/`
			`public static Map<String, Date> select(String host, Integer depth, final Order order, int maxcount, State state) {`
			`Map<String, Date> result = new HashMap<>();`
			`if (state == State.INVENTORY \|\| state == State.ANY) result.putAll(inventory.select(host, depth, order, maxcount));`
			`if (state == State.ARCHIVE \|\| state == State.ANY) result.putAll(archive.select(host, depth, order, maxcount));`
			`return result;`
			`}`

			`/**`
			`* Compute the path of a snapshot. This does not create the snapshot, only gives a path.`
			`* Also, the path to the storage location is not created.`
			`* @param url`
			`* @param depth`
			`* @param date`
			`* @param ext`
			`* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY`
			`* @return a file to the snapshot`
			`*/`
			`public static File definePath(final DigestURL url, final int depth, final Date date, final String ext, State state) {`
			`if (state == State.ANY) throw new RuntimeException("definePath must be selected with INVENTORY or ARCHIVE state");`
			`if (state == State.INVENTORY) return inventory.definePath(url, depth, date, ext);`
			`if (state == State.ARCHIVE) return archive.definePath(url, depth, date, ext);`
			`return null;`
			`}`

			`public static void announceStorage(final DigestURL url, final int depth, final Date date) {`
			`inventory.announceStorage(url, depth, date);`
			`}`

			`/**`
			`* for a given url, get all paths for storage locations.`
			`* The locations are all for the single url but may represent different storage times.`
			`* This method is inefficient because it tests all different depths, it would be better to use`
			`* findPaths/3 with a given depth.`
			`* @param url`
			`* @param ext`
			`* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY`
			`* @return a set of files for snapshots of the url`
			`*/`
			`public static Collection<File> findPaths(final DigestURL url, final String ext, State state) {`
			`Collection<File> result = new ArrayList<>();`
			`if (state == State.INVENTORY \|\| state == State.ANY) result.addAll(inventory.findPaths(url, ext));`
			`if (state == State.ARCHIVE \|\| state == State.ANY) result.addAll(archive.findPaths(url, ext));`
			`return result;`
			`}`

			`/**`
			`* for a given url, get all paths for storage locations.`
			`* The locations are all for the single url but may represent different storage times.`
			`* @param url`
			`* @param ext`
			`* @param depth`
			`* @param state the wanted transaction state, State.INVENTORY, State.ARCHIVE or State.ANY`
			`* @return a set of files for snapshots of the url`
			`*/`
			`public static Collection<File> findPaths(final DigestURL url, final int depth, final String ext, State state) {`
			`Collection<File> result = new ArrayList<>();`
			`if (state == State.INVENTORY \|\| state == State.ANY) result.addAll(inventory.findPaths(url, depth, ext));`
			`if (state == State.ARCHIVE \|\| state == State.ANY) result.addAll(archive.findPaths(url, depth, ext));`
			`return result;`
			`}`

			`}`