You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
456 lines
20 KiB
456 lines
20 KiB
/**
 *  DocumentImage
 *  Copyright 2014 by Michael Peter Christen
 *  First released 29.11.2014 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
|
|
|
|
package net.yacy.crawler.data;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.IOException;
|
|
import java.io.InputStreamReader;
|
|
import java.text.ParseException;
|
|
import java.util.ArrayList;
|
|
import java.util.Collection;
|
|
import java.util.Date;
|
|
import java.util.HashMap;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
import java.util.LinkedHashMap;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.TreeMap;
|
|
import java.util.TreeSet;
|
|
|
|
import org.apache.solr.common.SolrDocument;
|
|
|
|
import net.yacy.cora.date.GenericFormatter;
|
|
import net.yacy.cora.document.encoding.ASCII;
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.search.index.Fulltext;
|
|
import net.yacy.search.schema.CollectionSchema;
|
|
|
|
/**
|
|
* This class hosts document snapshots.
|
|
*
|
|
* The storage is organized in the following hierarchy:
|
|
* - in the root path are subpaths for each host:port
|
|
* - in the host:port path are subpaths for the crawl depth, two digits length
|
|
* - in the crawl depth path are subpaths for the first two charaters of the url-hash, called shard
|
|
* - in the shard path are files, named with <urlhash>'.'<date>.<ext>
|
|
* .. where the <date> has the form "yyyyMMdd" and ext may be one of {pdf,jpg,png,xml,json}.
|
|
* The pdf is created with wxhtmltopdf, jpg/png is created with convert
|
|
* and the xml/json is an extract from solr.
|
|
*
|
|
* The construction of the file name with the date allows to make several copies of the same document
|
|
* for different snapshot-times. The usage of the crawl depth makes it easier to extract a specific part
|
|
* of the domain.
|
|
*/
|
|
public class Snapshots {
|
|
|
|
private File storageLocation;
|
|
|
|
private Map<String, TreeMap<Integer, TreeSet<String>>> directory; // a TreeMap for each domain (host.port) where the key is the depth and the value is a Set containing a key/urlhash id to get all files into a specific order to provide a recent view on the documents
|
|
|
|
public Snapshots(final File location) {
|
|
this.storageLocation = location;
|
|
this.storageLocation.mkdirs();
|
|
// scan the location to fill the directory
|
|
this.directory = new HashMap<>();
|
|
for (String hostport: location.list()) {
|
|
TreeMap<Integer, TreeSet<String>> domaindepth = new TreeMap<>();
|
|
this.directory.put(hostport, domaindepth);
|
|
File domaindir = new File(location, hostport);
|
|
if (domaindir.isDirectory()) domainscan: for (String depth: domaindir.list()) {
|
|
TreeSet<String> dateid = new TreeSet<>();
|
|
Integer depthi = -1;
|
|
try {
|
|
depthi = Integer.parseInt(depth);
|
|
} catch (NumberFormatException e) {
|
|
continue domainscan;
|
|
}
|
|
domaindepth.put(depthi, dateid);
|
|
File sharddir = new File(domaindir, depth);
|
|
if (sharddir.isDirectory()) for (String shard: sharddir.list()) {
|
|
File snapshotdir = new File(sharddir, shard);
|
|
if (snapshotdir.isDirectory()) {
|
|
for (String snapshotfile: snapshotdir.list()) {
|
|
if (snapshotfile.endsWith(".xml")) {
|
|
String s = snapshotfile.substring(0, snapshotfile.length() - 4);
|
|
int p = s.indexOf('.');
|
|
assert p == 12;
|
|
if (p > 0) {
|
|
String key = s.substring(p + 1) + '.' + s.substring(0, p);
|
|
dateid.add(key);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (dateid.size() == 0) domaindepth.remove(depthi);
|
|
}
|
|
if (domaindepth.size() == 0) this.directory.remove(hostport);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* get the number of entries in the snapshot directory
|
|
* @return the total number of different documents
|
|
*/
|
|
public int size() {
|
|
int c = 0;
|
|
for (Map<Integer, TreeSet<String>> m: directory.values()) {
|
|
for (TreeSet<String> n: m.values()) {
|
|
c += n.size();
|
|
}
|
|
}
|
|
return c;
|
|
}
|
|
|
|
/**
|
|
* get a list of <host>.<port> names in the snapshot directory
|
|
* @return
|
|
*/
|
|
public Set<String> listHosts() {
|
|
return directory.keySet();
|
|
}
|
|
|
|
public final class Revisions {
|
|
public final int depth;
|
|
public final Date[] dates;
|
|
public final String urlhash;
|
|
public final String url;
|
|
public final File[] pathtoxml;
|
|
public Revisions(final String hostport, final int depth, final String datehash) {
|
|
this.depth = depth;
|
|
int p = datehash.indexOf('.');
|
|
this.dates = new Date[1];
|
|
String datestring = datehash.substring(0, p);
|
|
this.dates[0] = parseDate(datestring);
|
|
this.urlhash = datehash.substring(p + 1);
|
|
this.pathtoxml = new File[1];
|
|
this.pathtoxml[0] = new File(pathToShard(hostport, urlhash, depth), this.urlhash + "." + datestring + ".xml");
|
|
String u = null;
|
|
if (this.pathtoxml[0].exists()) {
|
|
try {
|
|
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(this.pathtoxml[0])));
|
|
String line;
|
|
while ((line = reader.readLine()) != null) {
|
|
if (line.startsWith("<str name=\"sku\">")) {
|
|
u = line.substring(16, line.length() - 6);
|
|
break;
|
|
}
|
|
}
|
|
reader.close();
|
|
} catch (IOException e) {}
|
|
}
|
|
this.url = u;
|
|
}
|
|
}
|
|
|
|
public Revisions getRevisions(String urlhash) {
|
|
if (urlhash == null || urlhash.length() == 0) return null;
|
|
// search for the hash, we must iterate through all entries
|
|
for (Map.Entry<String, TreeMap<Integer, TreeSet<String>>> hostportDomaindepth: this.directory.entrySet()) {
|
|
String hostport = hostportDomaindepth.getKey();
|
|
for (Map.Entry<Integer, TreeSet<String>> depthDateHash: hostportDomaindepth.getValue().entrySet()) {
|
|
int depth = depthDateHash.getKey();
|
|
for (String dateHash: depthDateHash.getValue()) {
|
|
if (dateHash.endsWith(urlhash)) {
|
|
return new Revisions(hostport, depth, dateHash);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* list the snapshots for a given host name
|
|
* @param hostport the <host>.<port> identifier for the domain
|
|
* @param depth restrict the result to the given depth or if depth == -1 do not restrict to a depth
|
|
* @return a map with a set for each depth in the domain of the host name
|
|
*/
|
|
public TreeMap<Integer, Collection<Revisions>> listIDs(final String hostport, final int depth) {
|
|
TreeMap<Integer, Collection<Revisions>> result = new TreeMap<>();
|
|
TreeMap<Integer, TreeSet<String>> list = directory.get(hostport);
|
|
if (list != null) {
|
|
for (Map.Entry<Integer, TreeSet<String>> entry: list.entrySet()) {
|
|
if (depth != -1 && entry.getKey() != depth) continue;
|
|
Collection<Revisions> r = new ArrayList<>(entry.getValue().size());
|
|
for (String datehash: entry.getValue()) {
|
|
r.add(new Revisions(hostport, entry.getKey(), datehash));
|
|
}
|
|
result.put(entry.getKey(), r);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* get the number of snapshots for the given host name
|
|
* @param hostport the <host>.<port> identifier for the domain
|
|
* @param depth restrict the result to the given depth or if depth == -1 do not restrict to a depth
|
|
* @return a count, the total number of documents for the domain and depth
|
|
*/
|
|
public int listIDsSize(final String hostport, final int depth) {
|
|
int count = 0;
|
|
TreeMap<Integer, TreeSet<String>> list = directory.get(hostport);
|
|
if (list != null) {
|
|
for (Map.Entry<Integer, TreeSet<String>> entry: list.entrySet()) {
|
|
if (depth != -1 && entry.getKey() != depth) continue;
|
|
count += entry.getValue().size();
|
|
}
|
|
}
|
|
return count;
|
|
}
|
|
|
|
/**
|
|
* Compute the path of a snapshot. This does not create the snapshot, only gives a path.
|
|
* Also, the path to the storage location is not created.
|
|
* @param url
|
|
* @param ext
|
|
* @param depth
|
|
* @param date
|
|
* @return a file to the snapshot
|
|
*/
|
|
public File definePath(final DigestURL url, final int depth, final Date date, final String ext) {
|
|
String id = ASCII.String(url.hash());
|
|
String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
|
|
File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
|
|
return path;
|
|
}
|
|
|
|
/**
|
|
* Write information about the storage of a snapshot to the Snapshot-internal index.
|
|
* The actual writing of files to the target directory must be done elsewehre, this method does not store the snapshot files.
|
|
* @param url
|
|
* @param depth
|
|
* @param date
|
|
*/
|
|
public void announceStorage(final DigestURL url, final int depth, final Date date) {
|
|
String id = ASCII.String(url.hash());
|
|
String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
|
|
String pathToHostPortDir = pathToHostPortDir(url.getHost(), url.getPort());
|
|
TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostPortDir);
|
|
if (domaindepth == null) {domaindepth = new TreeMap<Integer, TreeSet<String>>(); this.directory.put(pathToHostPortDir(url.getHost(), url.getPort()), domaindepth);}
|
|
TreeSet<String> dateid = domaindepth.get(depth);
|
|
if (dateid == null) {dateid = new TreeSet<String>(); domaindepth.put(depth, dateid);}
|
|
dateid.add(ds + '.' + id);
|
|
}
|
|
|
|
/**
|
|
* Delete information about the storage of a snapshot to the Snapshot-internal index.
|
|
* The actual deletion of files in the target directory must be done elsewehre, this method does not store the snapshot files.
|
|
* @param url
|
|
* @param depth
|
|
* @param date
|
|
*/
|
|
public Set<Date> announceDeletion(final DigestURL url, final int depth) {
|
|
HashSet<Date> dates = new HashSet<>();
|
|
String id = ASCII.String(url.hash());
|
|
String pathToHostPortDir = pathToHostPortDir(url.getHost(), url.getPort());
|
|
TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostPortDir);
|
|
if (domaindepth == null) return dates;
|
|
TreeSet<String> dateid = domaindepth.get(depth);
|
|
if (dateid == null) return dates;
|
|
Iterator<String> i = dateid.iterator();
|
|
while (i.hasNext()) {
|
|
String dis = i.next();
|
|
if (dis.endsWith("." + id)) {
|
|
String d = dis.substring(0, dis.length() - id.length() - 1);
|
|
Date date = parseDate(d);
|
|
if (date != null) dates.add(date);
|
|
i.remove();
|
|
}
|
|
}
|
|
if (dateid.size() == 0) domaindepth.remove(depth);
|
|
if (domaindepth.size() == 0) this.directory.remove(pathToHostPortDir);
|
|
return dates;
|
|
}
|
|
|
|
/**
|
|
* Order enum class for the select method
|
|
*/
|
|
public static enum Order {
|
|
ANY, OLDESTFIRST, LATESTFIRST;
|
|
}
|
|
|
|
/**
|
|
* select a set of urlhashes from the snapshot directory. The selection either ordered
|
|
* by generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or with any
|
|
* order. The result set can be selected either with a given host or a depth
|
|
* @param host selected host or null for all hosts
|
|
* @param depth selected depth or null for all depths
|
|
* @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
|
|
* @param maxcount the maximum number of hosthashes. If unlimited, submit Integer.MAX_VALUE
|
|
* @return a map of hosthashes with the associated creation date
|
|
*/
|
|
public LinkedHashMap<String, Revisions> select(final String host, final Integer depth, final Order order, int maxcount) {
|
|
TreeMap<String, String[]> dateIdResult = new TreeMap<>();
|
|
if (host == null && depth == null) {
|
|
loop: for (Map.Entry<String, TreeMap<Integer, TreeSet<String>>> hostportDepths: this.directory.entrySet()) {
|
|
for (Map.Entry<Integer, TreeSet<String>> depthIds: hostportDepths.getValue().entrySet()) {
|
|
for (String id: depthIds.getValue()) {
|
|
dateIdResult.put(id, new String[]{hostportDepths.getKey(), Integer.toString(depthIds.getKey())});
|
|
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (host == null && depth != null) {
|
|
loop: for (Map.Entry<String, TreeMap<Integer, TreeSet<String>>> hostportDepths: this.directory.entrySet()) {
|
|
TreeSet<String> ids = hostportDepths.getValue().get(depth);
|
|
if (ids != null) for (String id: ids) {
|
|
dateIdResult.put(id, new String[]{hostportDepths.getKey(), Integer.toString(depth)});
|
|
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
|
|
}
|
|
}
|
|
}
|
|
if (host != null && depth == null) {
|
|
String hostport = pathToHostPortDir(host,80);
|
|
TreeMap<Integer, TreeSet<String>> depthIdsMap = this.directory.get(hostport);
|
|
if (depthIdsMap != null) loop: for (Map.Entry<Integer, TreeSet<String>> depthIds: depthIdsMap.entrySet()) {
|
|
for (String id: depthIds.getValue()) {
|
|
dateIdResult.put(id, new String[]{hostport, Integer.toString(depthIds.getKey())});
|
|
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
|
|
}
|
|
}
|
|
}
|
|
if (host != null && depth != null) {
|
|
String hostport = pathToHostPortDir(host,80);
|
|
TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(hostport);
|
|
if (domaindepth != null) {
|
|
TreeSet<String> ids = domaindepth.get(depth);
|
|
if (ids != null) loop: for (String id: ids) {
|
|
dateIdResult.put(id, new String[]{hostport, Integer.toString(depth)});
|
|
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
|
|
}
|
|
}
|
|
}
|
|
LinkedHashMap<String, Revisions> result = new LinkedHashMap<>();
|
|
Iterator<Map.Entry<String, String[]>> i = order == Order.LATESTFIRST ? dateIdResult.descendingMap().entrySet().iterator() : dateIdResult.entrySet().iterator();
|
|
while (i.hasNext() && result.size() < maxcount) {
|
|
Map.Entry<String, String[]> entry = i.next();
|
|
String datehash = entry.getKey();
|
|
int p = datehash.indexOf('.');
|
|
assert p >= 0;
|
|
Revisions r = new Revisions(entry.getValue()[0], Integer.parseInt(entry.getValue()[1]), datehash);
|
|
result.put(datehash.substring(p + 1), r);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
private static Date parseDate(String d) {
|
|
try {
|
|
return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime();
|
|
} catch (ParseException e) {
|
|
try {
|
|
return GenericFormatter.SHORT_DAY_FORMATTER.parse(d, 0).getTime();
|
|
} catch (ParseException ee) {
|
|
return null;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* get the depth to a document, helper method for definePath to determine the depth value
|
|
* @param url
|
|
* @param fulltext
|
|
* @return the crawldepth of the document
|
|
*/
|
|
public int getDepth(final DigestURL url, final Fulltext fulltext) {
|
|
Integer depth = null;
|
|
if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
|
|
try {
|
|
SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
|
|
if (doc != null) {
|
|
depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
|
|
}
|
|
} catch (IOException e) {
|
|
}
|
|
}
|
|
return depth == null ? 0 : depth;
|
|
}
|
|
|
|
/**
|
|
* for a given url, get all paths for storage locations.
|
|
* The locations are all for the single url but may represent different storage times.
|
|
* This method is inefficient because it tests all different depths, it would be better to use
|
|
* findPaths/3 with a given depth.
|
|
* @param url
|
|
* @param ext
|
|
* @return a set of files for snapshots of the url
|
|
*/
|
|
public Collection<File> findPaths(final DigestURL url, final String ext) {
|
|
for (int i = 0; i < 100; i++) {
|
|
Collection<File> paths = findPaths(url, i, ext);
|
|
if (paths.size() > 0) return paths;
|
|
}
|
|
return new ArrayList<>(0);
|
|
}
|
|
|
|
// pathtoxml = <storageLocation>/<host>.<port>/<depth>/<shard>/<urlhash>.<date>.xml
|
|
|
|
/**
|
|
* for a given url, get all paths for storage locations.
|
|
* The locations are all for the single url but may represent different storage times.
|
|
* @param url
|
|
* @param ext required extension or null if the extension must not be checked
|
|
* @param depth
|
|
* @return a set of files for snapshots of the url
|
|
*/
|
|
public Collection<File> findPaths(final DigestURL url, final int depth, final String ext) {
|
|
String id = ASCII.String(url.hash());
|
|
File pathToShard = pathToShard(url, depth);
|
|
String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if path does not exist
|
|
ArrayList<File> paths = new ArrayList<>();
|
|
if (list != null) {
|
|
for (String f: list) {
|
|
if (f.startsWith(id) && (ext == null || f.endsWith(ext))) paths.add(new File(pathToShard, f));
|
|
}
|
|
}
|
|
return paths;
|
|
}
|
|
|
|
private File pathToShard(final DigestURL url, final int depth) {
|
|
return pathToShard(pathToHostPortDir(url.getHost(), url.getPort()), ASCII.String(url.hash()), depth);
|
|
}
|
|
|
|
private File pathToShard(final String hostport, final String urlhash, final int depth) {
|
|
File pathToHostDir = new File(storageLocation, hostport);
|
|
File pathToDepthDir = new File(pathToHostDir, pathToDepthDir(depth));
|
|
File pathToShard = new File(pathToDepthDir, pathToShard(urlhash));
|
|
return pathToShard;
|
|
}
|
|
|
|
private String pathToHostPortDir(final String host, final int port) {
|
|
return host + "." + port;
|
|
}
|
|
|
|
private String pathToDepthDir(final int depth) {
|
|
return depth < 10 ? "0" + depth : Integer.toString(depth);
|
|
}
|
|
|
|
private String pathToShard(final String urlhash) {
|
|
return urlhash.substring(0, 2);
|
|
}
|
|
|
|
}
|