added rss feed output to the snapshot servlet which can be used to get a
list of the latest/oldest entries in the snapshot database. This is an
example:
http://localhost:8090/api/snapshot.rss?depth=2&order=LATESTFIRST&host=yacy.net&maxcount=100

The properties depth, order, host and maxcount can be omitted. The
fields have the following meaning:
host: select only urls from this host, or from all hosts if not given
depth: select only urls at this crawl depth, or at all depths if not given
maxcount: select at most the given number of urls, or 10 if not given
order: either LATESTFIRST to select the youngest entries, OLDESTFIRST to
select the oldest entries, or ANY to select entries in any order

The rss feed requires administration rights; a call to this servlet
with the rss extension must therefore attach login credentials.
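
For illustration, a minimal client sketch for this feed (not part of this
commit). It fetches the example URL above with plain HttpURLConnection;
the peer address localhost:8090 and the admin:password account are
assumptions, and HTTP Basic credentials are only one way to attach a
login; a peer may be configured to require digest authentication instead.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class SnapshotFeedClient {
    public static void main(String[] args) throws Exception {
        // assumed local peer and admin account; adjust to the actual configuration
        URL feed = new URL("http://localhost:8090/api/snapshot.rss?depth=2&order=LATESTFIRST&host=yacy.net&maxcount=100");
        HttpURLConnection con = (HttpURLConnection) feed.openConnection();
        // attach login credentials (basic auth shown; digest may be required)
        String login = Base64.getEncoder().encodeToString("admin:password".getBytes(StandardCharsets.UTF_8));
        con.setRequestProperty("Authorization", "Basic " + login);
        try (BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) System.out.println(line);
        }
    }
}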
Michael Peter Christen 10 years ago
parent 8b522687e0
commit 4fe4bf29ad

@@ -27,13 +27,18 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Collection;
 import java.util.Date;
+import java.util.Map;

 import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.document.feed.RSSFeed;
+import net.yacy.cora.document.feed.RSSMessage;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.Html2Image;
+import net.yacy.crawler.data.Snapshots;
 import net.yacy.document.ImageParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
@@ -53,9 +58,37 @@ public class snapshot {
         final Switchboard sb = (Switchboard) env;
         if (post == null) return null;

+        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
         final String ext = header.get("EXT", "");
+
+        if (ext.equals("rss")) {
+            // create a report about the content of the snapshot directory
+            if (!authenticated) return null;
+            int maxcount = post.getInt("maxcount", 10);
+            int depthx = post.getInt("depth", -1);
+            Integer depth = depthx == -1 ? null : depthx;
+            String orderx = post.get("order", "ANY");
+            Snapshots.Order order = Snapshots.Order.valueOf(orderx);
+            String host = post.get("host");
+            Map<String, Date> iddate = sb.snapshots.select(host, depth, order, maxcount);
+            // now select the URL from the index for these ids in iddate and make an RSS feed
+            RSSFeed rssfeed = new RSSFeed(Integer.MAX_VALUE);
+            rssfeed.setChannel(new RSSMessage("Snapshot list for host = " + host + ", depth = " + depth + ", order = " + order + ", maxcount = " + maxcount, "", ""));
+            for (Map.Entry<String, Date> e: iddate.entrySet()) {
+                try {
+                    DigestURL u = sb.index.fulltext().getURL(e.getKey());
+                    RSSMessage message = new RSSMessage(u.toNormalform(true), "", u, e.getKey());
+                    message.setPubDate(e.getValue());
+                    rssfeed.addMessage(message);
+                } catch (IOException ee) {
+                    ConcurrentLog.logException(ee);
+                }
+            }
+            byte[] rssBinary = UTF8.getBytes(rssfeed.toString());
+            return new ByteArrayInputStream(rssBinary);
+        }
+
         final boolean pdf = ext.equals("pdf");
-        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
         if (pdf && !authenticated) return null;
         final boolean pngjpg = ext.equals("png") || ext.equals("jpg");
         String urlhash = post.get("urlhash", "");
@@ -132,6 +165,7 @@ public class snapshot {
         }
         return null;
     }
+
 }
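
A short consumption sketch for the feed produced above (illustrative, not
part of the commit): the rss bytes can be parsed with the JDK's DOM parser.
This assumes that RSSFeed.toString() emits a conventional RSS 2.0 channel
in which every message added above becomes one item element whose link
carries the snapshot URL.

import java.io.ByteArrayInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

public class SnapshotFeedParser {
    // print the link of every feed item contained in the raw rss bytes
    public static void printLinks(byte[] rssBinary) throws Exception {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document doc = builder.parse(new ByteArrayInputStream(rssBinary));
        NodeList items = doc.getElementsByTagName("item");
        for (int i = 0; i < items.getLength(); i++) {
            Element item = (Element) items.item(i);
            NodeList links = item.getElementsByTagName("link");
            if (links.getLength() > 0) System.out.println(links.item(0).getTextContent());
        }
    }
}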

@@ -22,16 +22,21 @@ package net.yacy.crawler.data;
 import java.io.File;
 import java.io.IOException;
+import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.TreeSet;

 import org.apache.solr.common.SolrDocument;

 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.util.Html2Image;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.CollectionSchema;
@@ -56,8 +61,42 @@ public class Snapshots {

     private File storageLocation;
+    private Map<String, TreeMap<Integer, TreeSet<String>>> directory; // a TreeMap for each domain where the key is the depth and the value is a Set containing a key/urlhash id to get all files into a specific order to provide a recent view on the documents

     public Snapshots(File location) {
         this.storageLocation = location;
+        // scan the location to fill the directory
+        this.directory = new HashMap<>();
+        for (String domain: location.list()) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = new TreeMap<>();
+            this.directory.put(domain, domaindepth);
+            File domaindir = new File(location, domain);
+            if (domaindir.isDirectory()) domainscan: for (String depth: domaindir.list()) {
+                TreeSet<String> dateid = new TreeSet<>();
+                Integer depthi = -1;
+                try {
+                    depthi = Integer.parseInt(depth);
+                } catch (NumberFormatException e) {
+                    continue domainscan;
+                }
+                domaindepth.put(depthi, dateid);
+                File sharddir = new File(domaindir, depth);
+                if (sharddir.isDirectory()) for (String shard: sharddir.list()) {
+                    File snapshotdir = new File(sharddir, shard);
+                    if (snapshotdir.isDirectory()) for (String snapshotfile: snapshotdir.list()) {
+                        if (snapshotfile.endsWith(".pdf")) {
+                            String s = snapshotfile.substring(0, snapshotfile.length() - 4);
+                            int p = s.indexOf('.');
+                            assert p == 12;
+                            if (p > 0) {
+                                String key = s.substring(p + 1) + '.' + s.substring(0, p);
+                                dateid.add(key);
+                            }
+                        }
+                    }
+                }
+            }
+        }
     }

     /**
@@ -75,10 +114,14 @@ public class Snapshots {
         if (replaceOld) {
             for (File oldPath: oldPaths) oldPath.delete();
         }
-        File path = definePath(url, "pdf", depth, date);
+        File path = definePath(url, depth, date, "pdf");
         path.getParentFile().mkdirs();
         boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
-        return success ? path : null;
+        if (success) {
+            announceStorage(url, depth, date);
+            return path;
+        }
+        return null;
     }

     /**
@@ -90,13 +133,91 @@ public class Snapshots {
      * @param date
      * @return a file to the snapshot
      */
-    public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
+    public File definePath(final DigestURL url, final int depth, final Date date, final String ext) {
         String id = ASCII.String(url.hash());
-        String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
+        String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
         File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
         return path;
     }
+
+    private void announceStorage(final DigestURL url, final int depth, final Date date) {
+        String id = ASCII.String(url.hash());
+        String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
+        TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(url));
+        if (domaindepth == null) {domaindepth = new TreeMap<Integer, TreeSet<String>>(); this.directory.put(pathToHostDir(url), domaindepth);}
+        TreeSet<String> dateid = domaindepth.get(depth);
+        if (dateid == null) {dateid = new TreeSet<String>(); domaindepth.put(depth, dateid);}
+        dateid.add(ds + '.' + id);
+    }
+
+    public static enum Order {
+        ANY, OLDESTFIRST, LATESTFIRST;
+    }
+
+    /**
+     * select a set of urlhashes from the snapshot directory. The selection is either ordered
+     * by generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or has any
+     * order. The result set can be restricted to a given host and/or a given depth.
+     * @param host selected host or null for all hosts
+     * @param depth selected depth or null for all depths
+     * @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
+     * @param maxcount the maximum number of urlhashes; submit Integer.MAX_VALUE for no limit
+     * @return a map of urlhashes with their associated creation date
+     */
+    public Map<String, Date> select(String host, Integer depth, final Order order, int maxcount) {
+        TreeSet<String> dateIdResult = new TreeSet<>();
+        if (host == null && depth == null) {
+            loop: for (TreeMap<Integer, TreeSet<String>> domaindepth: this.directory.values()) {
+                for (TreeSet<String> keys: domaindepth.values()) {
+                    dateIdResult.addAll(keys);
+                    if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+                }
+            }
+        }
+        if (host == null && depth != null) {
+            loop: for (TreeMap<Integer, TreeSet<String>> domaindepth: this.directory.values()) {
+                TreeSet<String> keys = domaindepth.get(depth);
+                if (keys != null) dateIdResult.addAll(keys);
+                if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+            }
+        }
+        if (host != null && depth == null) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(host, 80));
+            if (domaindepth != null) loop: for (TreeSet<String> keys: domaindepth.values()) {
+                dateIdResult.addAll(keys);
+                if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+            }
+        }
+        if (host != null && depth != null) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(host, 80));
+            if (domaindepth != null) {
+                TreeSet<String> keys = domaindepth.get(depth);
+                if (keys != null) dateIdResult.addAll(keys);
+            }
+        }
+        Map<String, Date> result = new HashMap<>();
+        Iterator<String> i = order == Order.LATESTFIRST ? dateIdResult.descendingIterator() : dateIdResult.iterator();
+        while (i.hasNext() && result.size() < maxcount) {
+            String di = i.next();
+            int p = di.indexOf('.');
+            assert p >= 0;
+            String d = di.substring(0, p);
+            Date date;
+            try {
+                date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d);
+            } catch (ParseException e) {
+                try {
+                    date = GenericFormatter.SHORT_DAY_FORMATTER.parse(d);
+                } catch (ParseException ee) {
+                    date = new Date();
+                }
+            }
+            result.put(di.substring(p + 1), date);
+        }
+        return result;
+    }
+
     /**
      * get the depth to a document, helper method for definePath to determine the depth value
      * @param url
@@ -157,10 +278,26 @@ public class Snapshots {
     private File pathToShard(final DigestURL url, final int depth) {
         String id = ASCII.String(url.hash());
-        File pathToHostDir = new File(storageLocation, url.getHost() + "." + url.getPort());
-        File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
-        File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
+        File pathToHostDir = new File(storageLocation, pathToHostDir(url));
+        File pathToDepthDir = new File(pathToHostDir, pathToDepthDir(depth));
+        File pathToShard = new File(pathToDepthDir, pathToShard(id));
         return pathToShard;
     }
+
+    private String pathToHostDir(final DigestURL url) {
+        return pathToHostDir(url.getHost(), url.getPort());
+    }
+
+    private String pathToHostDir(final String host, final int port) {
+        return host + "." + port;
+    }
+
+    private String pathToDepthDir(final int depth) {
+        return depth < 10 ? "0" + depth : Integer.toString(depth);
+    }
+
+    private String pathToShard(final String urlhash) {
+        return urlhash.substring(0, 2);
+    }
 }
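
The directory index above relies on a simple ordering trick: every entry in
the per-depth TreeSet is keyed as <date>.<urlhash>, where the date stamp has
a fixed length and zero padding (SHORT_MINUTE_FORMATTER, presumably a
yyyyMMddHHmm pattern), so the lexicographic order of the set coincides with
the chronological order. OLDESTFIRST then maps to the ascending iterator and
LATESTFIRST to descendingIterator(), exactly as select() does. A minimal,
self-contained sketch with made-up keys and hashes:

import java.util.Iterator;
import java.util.TreeSet;

public class DateIdOrderDemo {
    public static void main(String[] args) {
        // keys follow the <date>.<urlhash> layout of the directory index;
        // the stamps and hashes below are invented for illustration
        TreeSet<String> dateid = new TreeSet<>();
        dateid.add("201501081200.AAAAAAAAAAAA");
        dateid.add("201502011830.BBBBBBBBBBBB");
        dateid.add("201502020915.CCCCCCCCCCCC");
        // OLDESTFIRST: ascending lexicographic order is chronological order
        for (String key : dateid) System.out.println("oldest-first: " + key);
        // LATESTFIRST: the descending iterator yields the youngest entry first
        Iterator<String> latest = dateid.descendingIterator();
        while (latest.hasNext()) System.out.println("latest-first: " + latest.next());
    }
}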
