From 8b522687e0a15b3237eb0f032bc10ff0418baea5 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Sat, 6 Dec 2014 00:18:14 +0100
Subject: [PATCH 1/3] added toString() methods to feed classes, which makes it
 possible to export full rss feed files out of the RSSFeed class

---
 .../net/yacy/cora/document/feed/RSSFeed.java | 25 ++++++++++++++++++-
 .../yacy/cora/document/feed/RSSMessage.java  | 14 ++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/source/net/yacy/cora/document/feed/RSSFeed.java b/source/net/yacy/cora/document/feed/RSSFeed.java
index 009c500ea..552dce869 100644
--- a/source/net/yacy/cora/document/feed/RSSFeed.java
+++ b/source/net/yacy/cora/document/feed/RSSFeed.java
@@ -36,10 +36,33 @@ public class RSSFeed implements Iterable<RSSMessage> {
     public static final int DEFAULT_MAXSIZE = 10000;
 
     // class variables
-    private RSSMessage channel;
+    private RSSMessage channel = null;
     private final Map<String, RSSMessage> messages; // a guid:Item map
     private final int maxsize;
+
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
+        sb.append("<rss version=\"2.0\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\">\n");
+        sb.append("<channel>\n");
+        if (this.channel != null) sb.append(this.channel.toString(false));
+        sb.append("<opensearch:startIndex>0</opensearch:startIndex>\n");
+        sb.append("<opensearch:itemsPerPage>" + this.size() + "</opensearch:itemsPerPage>\n");
+        sb.append("<opensearch:totalResults>" + this.size() + "</opensearch:totalResults>\n");
+        for (RSSMessage item: messages.values()) {
+            sb.append(item.toString());
+        }
+        sb.append("</channel>\n");
+        sb.append("</rss>\n");
+        return sb.toString();
+    }
+
     public RSSFeed(final int maxsize) {
         this.messages = Collections.synchronizedMap(new LinkedHashMap<String, RSSMessage>());
         this.channel = null;

diff --git a/source/net/yacy/cora/document/feed/RSSMessage.java b/source/net/yacy/cora/document/feed/RSSMessage.java
index 1886be789..c39892425 100644
--- a/source/net/yacy/cora/document/feed/RSSMessage.java
+++ b/source/net/yacy/cora/document/feed/RSSMessage.java
@@ -273,7 +273,19 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMessage> {
 
     @Override
     public String toString() {
-        return this.map.toString();
+        return toString(true);
+    }
+
+    public String toString(boolean withItemTag) {
+        StringBuilder sb = new StringBuilder();
+        if (withItemTag) sb.append("<item>\n");
+        if (this.map.containsKey(Token.title.name())) sb.append("<title>").append(this.map.get(Token.title.name())).append("</title>\n");
+        if (this.map.containsKey(Token.link.name())) sb.append("<link>").append(this.map.get(Token.link.name())).append("</link>\n");
+        if (this.map.containsKey(Token.description.name())) sb.append("<description>").append(this.map.get(Token.description.name())).append("</description>\n");
+        if (this.map.containsKey(Token.pubDate.name())) sb.append("<pubDate>").append(this.map.get(Token.pubDate.name())).append("</pubDate>\n");
+        if (this.map.containsKey(Token.guid.name())) sb.append("<guid>").append(this.map.get(Token.guid.name())).append("</guid>\n");
+        if (withItemTag) sb.append("</item>\n");
+        return sb.toString();
     }
 
     @Override

From 4fe4bf29ad8eb89cd7532307fc649e3812f90ecf Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Sat, 6 Dec 2014 00:25:05 +0100
Subject: [PATCH 2/3] added rss feed output to snapshot servlet which can be
 used to get a list of latest/oldest entries in the snapshot database.

This is an example:
http://localhost:8090/api/snapshot.rss?depth=2&order=LATESTFIRST&host=yacy.net&maxcount=100

The properties depth, order, host and maxcount can be omitted. The
meanings of the fields are:
host: select only urls from this host or all, if not given
depth: select only urls at that crawl depth or all, if not given
maxcount: select at most the given number of urls or 10, if not given
order: either LATESTFIRST to select the youngest entries, OLDESTFIRST to
select the first entries, or ANY to select entries in any order

The rss feed needs administration rights to work; a call to this servlet
with the rss extension must attach login credentials.
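(Illustration, not part of the commit: a minimal Java client for the call
above. The account name "admin" and the password are placeholder
assumptions, as is the use of HTTP Basic authentication; how credentials
must be attached depends on the server's authentication settings.)

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;
    import java.util.Base64;

    public class SnapshotFeedClient {
        public static void main(String[] args) throws Exception {
            // the example query from the commit message above
            URL url = new URL("http://localhost:8090/api/snapshot.rss?depth=2&order=LATESTFIRST&host=yacy.net&maxcount=100");
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            // attach login credentials; "admin"/"password" are placeholders
            String credentials = "admin:password";
            con.setRequestProperty("Authorization", "Basic "
                    + Base64.getEncoder().encodeToString(credentials.getBytes(StandardCharsets.UTF_8)));
            // print the returned rss document
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) System.out.println(line);
            }
        }
    }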
---
 htroot/api/snapshot.java                    |  36 ++++-
 source/net/yacy/crawler/data/Snapshots.java | 153 +++++++++++++++++++-
 2 files changed, 180 insertions(+), 9 deletions(-)

diff --git a/htroot/api/snapshot.java b/htroot/api/snapshot.java
index 1d9e69d38..ff0e8f2d3 100644
--- a/htroot/api/snapshot.java
+++ b/htroot/api/snapshot.java
@@ -27,13 +27,18 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Collection;
 import java.util.Date;
+import java.util.Map;
 
 import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.document.feed.RSSFeed;
+import net.yacy.cora.document.feed.RSSMessage;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.Html2Image;
+import net.yacy.crawler.data.Snapshots;
 import net.yacy.document.ImageParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
@@ -53,9 +58,37 @@ public class snapshot {
         final Switchboard sb = (Switchboard) env;
         if (post == null) return null;
 
+        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
         final String ext = header.get("EXT", "");
+
+        if (ext.equals("rss")) {
+            // create a report about the content of the snapshot directory
+            if (!authenticated) return null;
+            int maxcount = post.getInt("maxcount", 10);
+            int depthx = post.getInt("depth", -1);
+            Integer depth = depthx == -1 ? null : depthx;
+            String orderx = post.get("order", "ANY");
+            Snapshots.Order order = Snapshots.Order.valueOf(orderx);
+            String host = post.get("host");
+            Map<String, Date> iddate = sb.snapshots.select(host, depth, order, maxcount);
+            // now select the URL from the index for these ids in iddate and make an RSS feed
+            RSSFeed rssfeed = new RSSFeed(Integer.MAX_VALUE);
+            rssfeed.setChannel(new RSSMessage("Snapshot list for host = " + host + ", depth = " + depth + ", order = " + order + ", maxcount = " + maxcount, "", ""));
+            for (Map.Entry<String, Date> e: iddate.entrySet()) {
+                try {
+                    DigestURL u = sb.index.fulltext().getURL(e.getKey());
+                    RSSMessage message = new RSSMessage(u.toNormalform(true), "", u, e.getKey());
+                    message.setPubDate(e.getValue());
+                    rssfeed.addMessage(message);
+                } catch (IOException ee) {
+                    ConcurrentLog.logException(ee);
+                }
+            }
+            byte[] rssBinary = UTF8.getBytes(rssfeed.toString());
+            return new ByteArrayInputStream(rssBinary);
+        }
+
         final boolean pdf = ext.equals("pdf");
-        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
         if (pdf && !authenticated) return null;
         final boolean pngjpg = ext.equals("png") || ext.equals("jpg");
         String urlhash = post.get("urlhash", "");
@@ -132,6 +165,7 @@ public class snapshot {
             }
 
         }
+        return null;
     }
 }

diff --git a/source/net/yacy/crawler/data/Snapshots.java b/source/net/yacy/crawler/data/Snapshots.java
index 609098795..0e233301c 100644
--- a/source/net/yacy/crawler/data/Snapshots.java
+++ b/source/net/yacy/crawler/data/Snapshots.java
@@ -22,16 +22,21 @@ package net.yacy.crawler.data;
 
 import java.io.File;
 import java.io.IOException;
+import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.TreeSet;
 
 import org.apache.solr.common.SolrDocument;
 
 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.util.Html2Image;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.CollectionSchema;
@@ -56,8 +61,42 @@ public class Snapshots {
 
     private File storageLocation;
 
+    private Map<String, TreeMap<Integer, TreeSet<String>>> directory; // a TreeMap for each domain where the key is the depth and the value is a Set containing a date.urlhash key to get all files into a specific order to provide a recent view on the documents
+
     public Snapshots(File location) {
         this.storageLocation = location;
+        // scan the location to fill the directory
+        this.directory = new HashMap<>();
+        for (String domain: location.list()) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = new TreeMap<>();
+            this.directory.put(domain, domaindepth);
+            File domaindir = new File(location, domain);
+            if (domaindir.isDirectory()) domainscan: for (String depth: domaindir.list()) {
+                TreeSet<String> dateid = new TreeSet<>();
+                Integer depthi = -1;
+                try {
+                    depthi = Integer.parseInt(depth);
+                } catch (NumberFormatException e) {
+                    continue domainscan;
+                }
+                domaindepth.put(depthi, dateid);
+                File sharddir = new File(domaindir, depth);
+                if (sharddir.isDirectory()) for (String shard: sharddir.list()) {
+                    File snapshotdir = new File(sharddir, shard);
+                    if (snapshotdir.isDirectory()) for (String snapshotfile: snapshotdir.list()) {
+                        if (snapshotfile.endsWith(".pdf")) {
+                            String s = snapshotfile.substring(0, snapshotfile.length() - 4);
+                            int p = s.indexOf('.');
+                            assert p == 12;
+                            if (p > 0) {
+                                String key = s.substring(p + 1) + '.' + s.substring(0, p);
+                                dateid.add(key);
+                            }
+                        }
+                    }
+                }
+            }
+        }
     }
 
     /**
@@ -75,10 +114,14 @@ public class Snapshots {
         if (replaceOld) {
             for (File oldPath: oldPaths) oldPath.delete();
         }
-        File path = definePath(url, "pdf", depth, date);
+        File path = definePath(url, depth, date, "pdf");
         path.getParentFile().mkdirs();
         boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
-        return success ? path : null;
+        if (success) {
+            announceStorage(url, depth, date);
+            return path;
+        }
+        return null;
     }
 
     /**
@@ -90,13 +133,91 @@ public class Snapshots {
      * @param date
      * @return a file to the snapshot
      */
-    public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
+    public File definePath(final DigestURL url, final int depth, final Date date, final String ext) {
         String id = ASCII.String(url.hash());
-        String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
+        String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
         File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
         return path;
     }
+
+    private void announceStorage(final DigestURL url, final int depth, final Date date) {
+        String id = ASCII.String(url.hash());
+        String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
+        TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(url));
+        if (domaindepth == null) {domaindepth = new TreeMap<Integer, TreeSet<String>>(); this.directory.put(pathToHostDir(url), domaindepth);}
+        TreeSet<String> dateid = domaindepth.get(depth);
+        if (dateid == null) {dateid = new TreeSet<String>(); domaindepth.put(depth, dateid);}
+        dateid.add(ds + '.' + id);
+    }
+
+    public static enum Order {
+        ANY, OLDESTFIRST, LATESTFIRST;
+    }
+
+    /**
+     * select a set of urlhashes from the snapshot directory. The selection is either ordered
+     * by generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or in any
+     * order (ANY). The result set can be restricted to a given host and/or depth.
+     * @param host selected host or null for all hosts
+     * @param depth selected depth or null for all depths
+     * @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
+     * @param maxcount the maximum number of urlhashes. If unlimited, submit Integer.MAX_VALUE
+     * @return a map of urlhashes with the associated creation date
+     */
+    public Map<String, Date> select(String host, Integer depth, final Order order, int maxcount) {
+        TreeSet<String> dateIdResult = new TreeSet<>();
+        if (host == null && depth == null) {
+            loop: for (TreeMap<Integer, TreeSet<String>> domaindepth: this.directory.values()) {
+                for (TreeSet<String> keys: domaindepth.values()) {
+                    dateIdResult.addAll(keys);
+                    if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+                }
+            }
+        }
+        if (host == null && depth != null) {
+            loop: for (TreeMap<Integer, TreeSet<String>> domaindepth: this.directory.values()) {
+                TreeSet<String> keys = domaindepth.get(depth);
+                if (keys != null) dateIdResult.addAll(keys);
+                if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+            }
+        }
+        if (host != null && depth == null) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(host, 80));
+            if (domaindepth != null) loop: for (TreeSet<String> keys: domaindepth.values()) {
+                dateIdResult.addAll(keys);
+                if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+            }
+        }
+        if (host != null && depth != null) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(host, 80));
+            if (domaindepth != null) {
+                TreeSet<String> keys = domaindepth.get(depth);
+                if (keys != null) dateIdResult.addAll(keys);
+            }
+        }
+        Map<String, Date> result = new HashMap<>();
+        Iterator<String> i = order == Order.LATESTFIRST ? dateIdResult.descendingIterator() : dateIdResult.iterator();
+        while (i.hasNext() && result.size() < maxcount) {
+            String di = i.next();
+            int p = di.indexOf('.');
+            assert p >= 0;
+            String d = di.substring(0, p);
+            Date date;
+            try {
+                date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d);
+            } catch (ParseException e) {
+                try {
+                    date = GenericFormatter.SHORT_DAY_FORMATTER.parse(d);
+                } catch (ParseException ee) {
+                    date = new Date();
+                }
+            }
+            result.put(di.substring(p + 1), date);
+        }
+        return result;
+    }
+
     /**
      * get the depth to a document, helper method for definePath to determine the depth value
      * @param url
@@ -157,10 +278,26 @@ public class Snapshots {
 
     private File pathToShard(final DigestURL url, final int depth) {
         String id = ASCII.String(url.hash());
-        File pathToHostDir = new File(storageLocation, url.getHost() + "." + url.getPort());
-        File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
-        File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
+        File pathToHostDir = new File(storageLocation, pathToHostDir(url));
+        File pathToDepthDir = new File(pathToHostDir, pathToDepthDir(depth));
+        File pathToShard = new File(pathToDepthDir, pathToShard(id));
         return pathToShard;
     }
 
+    private String pathToHostDir(final DigestURL url) {
+        return pathToHostDir(url.getHost(), url.getPort());
+    }
+
+    private String pathToHostDir(final String host, final int port) {
+        return host + "." + port;
+    }
+
+    private String pathToDepthDir(final int depth) {
+        return depth < 10 ? "0" + depth : Integer.toString(depth);
+    }
"0" + depth : Integer.toString(depth); + } + + private String pathToShard(final String urlhash) { + return urlhash.substring(0, 2); + } + } From d97deb5555957d68add3d10820aeb70c21f21408 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 6 Dec 2014 00:43:12 +0100 Subject: [PATCH 3/3] npe fix --- htroot/api/snapshot.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/htroot/api/snapshot.java b/htroot/api/snapshot.java index ff0e8f2d3..3d56130db 100644 --- a/htroot/api/snapshot.java +++ b/htroot/api/snapshot.java @@ -57,19 +57,18 @@ public class snapshot { public static Object respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard) env; - if (post == null) return null; final boolean authenticated = sb.adminAuthenticated(header) >= 2; final String ext = header.get("EXT", ""); if (ext.equals("rss")) { // create a report about the content of the snapshot directory if (!authenticated) return null; - int maxcount = post.getInt("maxcount", 10); - int depthx = post.getInt("depth", -1); + int maxcount = post == null ? 10 : post.getInt("maxcount", 10); + int depthx = post == null ? -1 : post.getInt("depth", -1); Integer depth = depthx == -1 ? null : depthx; - String orderx = post.get("order", "ANY"); + String orderx = post == null ? "ANY" : post.get("order", "ANY"); Snapshots.Order order = Snapshots.Order.valueOf(orderx); - String host = post.get("host"); + String host = post == null ? null : post.get("host"); Map iddate = sb.snapshots.select(host, depth, order, maxcount); // now select the URL from the index for these ids in iddate and make an RSS feed RSSFeed rssfeed = new RSSFeed(Integer.MAX_VALUE); @@ -77,6 +76,7 @@ public class snapshot { for (Map.Entry e: iddate.entrySet()) { try { DigestURL u = sb.index.fulltext().getURL(e.getKey()); + if (u == null) continue; RSSMessage message = new RSSMessage(u.toNormalform(true), "", u, e.getKey()); message.setPubDate(e.getValue()); rssfeed.addMessage(message); @@ -87,7 +87,8 @@ public class snapshot { byte[] rssBinary = UTF8.getBytes(rssfeed.toString()); return new ByteArrayInputStream(rssBinary); } - + + if (post == null) return null; final boolean pdf = ext.equals("pdf"); if (pdf && !authenticated) return null; final boolean pngjpg = ext.equals("png") || ext.equals("jpg");