From 8b522687e0a15b3237eb0f032bc10ff0418baea5 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Sat, 6 Dec 2014 00:18:14 +0100
Subject: [PATCH 1/3] added toString() methods to feed classes, which makes it
 possible to export full rss feed files out of the RSSFeed class

---
 .../net/yacy/cora/document/feed/RSSFeed.java | 25 ++++++++++++++++++-
 .../yacy/cora/document/feed/RSSMessage.java  | 14 ++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/source/net/yacy/cora/document/feed/RSSFeed.java b/source/net/yacy/cora/document/feed/RSSFeed.java
index 009c500ea..552dce869 100644
--- a/source/net/yacy/cora/document/feed/RSSFeed.java
+++ b/source/net/yacy/cora/document/feed/RSSFeed.java
@@ -36,10 +36,33 @@ public class RSSFeed implements Iterable<RSSMessage> {
     public static final int DEFAULT_MAXSIZE = 10000;
 
     // class variables
-    private RSSMessage channel;
+    private RSSMessage channel = null;
     private final Map<String, RSSMessage> messages; // a guid:Item map
     private final int maxsize;
+
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
+        sb.append("<rss version=\"2.0\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\">\n");
+        sb.append("<channel>\n");
+        if (this.channel != null) sb.append(this.channel.toString(false));
+        sb.append("<opensearch:startIndex>0</opensearch:startIndex>\n");
+        sb.append("<opensearch:itemsPerPage>" + this.size() + "</opensearch:itemsPerPage>\n");
+        sb.append("<opensearch:totalResults>" + this.size() + "</opensearch:totalResults>\n");
+        for (RSSMessage item: messages.values()) {
+            sb.append(item.toString());
+        }
+        sb.append("</channel>\n");
+        sb.append("</rss>\n");
+        return sb.toString();
+    }
+
     public RSSFeed(final int maxsize) {
         this.messages = Collections.synchronizedMap(new LinkedHashMap<String, RSSMessage>());
         this.channel = null;

diff --git a/source/net/yacy/cora/document/feed/RSSMessage.java b/source/net/yacy/cora/document/feed/RSSMessage.java
index 1886be789..c39892425 100644
--- a/source/net/yacy/cora/document/feed/RSSMessage.java
+++ b/source/net/yacy/cora/document/feed/RSSMessage.java
@@ -273,7 +273,19 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMessage> {
 
     @Override
     public String toString() {
-        return this.map.toString();
+        return toString(true);
+    }
+
+    public String toString(boolean withItemTag) {
+        StringBuilder sb = new StringBuilder();
+        if (withItemTag) sb.append("<item>\n");
+        if (this.map.containsKey(Token.title.name())) sb.append("<title>").append(this.map.get(Token.title.name())).append("</title>\n");
+        if (this.map.containsKey(Token.link.name())) sb.append("<link>").append(this.map.get(Token.link.name())).append("</link>\n");
+        if (this.map.containsKey(Token.description.name())) sb.append("<description>").append(this.map.get(Token.description.name())).append("</description>\n");
+        if (this.map.containsKey(Token.pubDate.name())) sb.append("<pubDate>").append(this.map.get(Token.pubDate.name())).append("</pubDate>\n");
+        if (this.map.containsKey(Token.guid.name())) sb.append("<guid>").append(this.map.get(Token.guid.name())).append("</guid>\n");
+        if (withItemTag) sb.append("</item>\n");
+        return sb.toString();
     }
 
     @Override

From 4fe4bf29ad8eb89cd7532307fc649e3812f90ecf Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Sat, 6 Dec 2014 00:25:05 +0100
Subject: [PATCH 2/3] added rss feed output to snapshot servlet which can be
 used to get a list of latest/oldest entries in the snapshot database.

This is an example:
http://localhost:8090/api/snapshot.rss?depth=2&order=LATESTFIRST&host=yacy.net&maxcount=100

The properties depth, order, host and maxcount can be omitted. The
meanings of the fields are:
host: select only urls from this host or all, if not given
depth: select only urls at that crawl depth or all, if not given
maxcount: select at most the given number of urls or 10, if not given
order: either LATESTFIRST to select the youngest entries, OLDESTFIRST to
select the first entries, or ANY to select entries in any order

The rss feed needs administration rights to work; a call to this servlet
with the rss extension must attach login credentials.
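(Illustration, not part of the commit: a minimal Java client for the call
above. The account name "admin" and the password are placeholder
assumptions, as is the use of HTTP Basic authentication; how credentials
must be attached depends on the server's authentication settings.)

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;
    import java.util.Base64;

    public class SnapshotFeedClient {
        public static void main(String[] args) throws Exception {
            // the example query from the commit message above
            URL url = new URL("http://localhost:8090/api/snapshot.rss?depth=2&order=LATESTFIRST&host=yacy.net&maxcount=100");
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            // attach login credentials; "admin"/"password" are placeholders
            String credentials = "admin:password";
            con.setRequestProperty("Authorization", "Basic "
                    + Base64.getEncoder().encodeToString(credentials.getBytes(StandardCharsets.UTF_8)));
            // print the returned rss document
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) System.out.println(line);
            }
        }
    }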
---
 htroot/api/snapshot.java                    |  36 ++++-
 source/net/yacy/crawler/data/Snapshots.java | 153 +++++++++++++++++++-
 2 files changed, 180 insertions(+), 9 deletions(-)

diff --git a/htroot/api/snapshot.java b/htroot/api/snapshot.java
index 1d9e69d38..ff0e8f2d3 100644
--- a/htroot/api/snapshot.java
+++ b/htroot/api/snapshot.java
@@ -27,13 +27,18 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Collection;
 import java.util.Date;
+import java.util.Map;
 
 import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.document.feed.RSSFeed;
+import net.yacy.cora.document.feed.RSSMessage;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.Html2Image;
+import net.yacy.crawler.data.Snapshots;
 import net.yacy.document.ImageParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
@@ -53,9 +58,37 @@ public class snapshot {
         final Switchboard sb = (Switchboard) env;
         if (post == null) return null;
 
+        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
         final String ext = header.get("EXT", "");
+
+        if (ext.equals("rss")) {
+            // create a report about the content of the snapshot directory
+            if (!authenticated) return null;
+            int maxcount = post.getInt("maxcount", 10);
+            int depthx = post.getInt("depth", -1);
+            Integer depth = depthx == -1 ? null : depthx;
+            String orderx = post.get("order", "ANY");
+            Snapshots.Order order = Snapshots.Order.valueOf(orderx);
+            String host = post.get("host");
+            Map<String, Date> iddate = sb.snapshots.select(host, depth, order, maxcount);
+            // now select the URL from the index for these ids in iddate and make an RSS feed
+            RSSFeed rssfeed = new RSSFeed(Integer.MAX_VALUE);
+            rssfeed.setChannel(new RSSMessage("Snapshot list for host = " + host + ", depth = " + depth + ", order = " + order + ", maxcount = " + maxcount, "", ""));
+            for (Map.Entry<String, Date> e: iddate.entrySet()) {
+                try {
+                    DigestURL u = sb.index.fulltext().getURL(e.getKey());
+                    RSSMessage message = new RSSMessage(u.toNormalform(true), "", u, e.getKey());
+                    message.setPubDate(e.getValue());
+                    rssfeed.addMessage(message);
+                } catch (IOException ee) {
+                    ConcurrentLog.logException(ee);
+                }
+            }
+            byte[] rssBinary = UTF8.getBytes(rssfeed.toString());
+            return new ByteArrayInputStream(rssBinary);
+        }
+
         final boolean pdf = ext.equals("pdf");
-        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
         if (pdf && !authenticated) return null;
         final boolean pngjpg = ext.equals("png") || ext.equals("jpg");
         String urlhash = post.get("urlhash", "");
@@ -132,6 +165,7 @@ public class snapshot {
             }
 
         }
+        return null;
     }
 }

diff --git a/source/net/yacy/crawler/data/Snapshots.java b/source/net/yacy/crawler/data/Snapshots.java
index 609098795..0e233301c 100644
--- a/source/net/yacy/crawler/data/Snapshots.java
+++ b/source/net/yacy/crawler/data/Snapshots.java
@@ -22,16 +22,21 @@ package net.yacy.crawler.data;
 
 import java.io.File;
 import java.io.IOException;
+import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.TreeSet;
 
 import org.apache.solr.common.SolrDocument;
 
 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.util.Html2Image;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.CollectionSchema;
@@ -56,8 +61,42 @@ public class Snapshots {
 
     private File storageLocation;
 
+    private Map<String, TreeMap<Integer, TreeSet<String>>> directory; // a TreeMap for each domain where the key is the depth and the value is a Set containing a date.urlhash key to get all files into a specific order to provide a recent view on the documents
+
     public Snapshots(File location) {
         this.storageLocation = location;
+        // scan the location to fill the directory
+        this.directory = new HashMap<>();
+        for (String domain: location.list()) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = new TreeMap<>();
+            this.directory.put(domain, domaindepth);
+            File domaindir = new File(location, domain);
+            if (domaindir.isDirectory()) domainscan: for (String depth: domaindir.list()) {
+                TreeSet<String> dateid = new TreeSet<>();
+                Integer depthi = -1;
+                try {
+                    depthi = Integer.parseInt(depth);
+                } catch (NumberFormatException e) {
+                    continue domainscan;
+                }
+                domaindepth.put(depthi, dateid);
+                File sharddir = new File(domaindir, depth);
+                if (sharddir.isDirectory()) for (String shard: sharddir.list()) {
+                    File snapshotdir = new File(sharddir, shard);
+                    if (snapshotdir.isDirectory()) for (String snapshotfile: snapshotdir.list()) {
+                        if (snapshotfile.endsWith(".pdf")) {
+                            String s = snapshotfile.substring(0, snapshotfile.length() - 4);
+                            int p = s.indexOf('.');
+                            assert p == 12;
+                            if (p > 0) {
+                                String key = s.substring(p + 1) + '.' + s.substring(0, p);
+                                dateid.add(key);
+                            }
+                        }
+                    }
+                }
+            }
+        }
     }
 
     /**
@@ -75,10 +114,14 @@ public class Snapshots {
         if (replaceOld) {
             for (File oldPath: oldPaths) oldPath.delete();
         }
-        File path = definePath(url, "pdf", depth, date);
+        File path = definePath(url, depth, date, "pdf");
         path.getParentFile().mkdirs();
         boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
-        return success ? path : null;
+        if (success) {
+            announceStorage(url, depth, date);
+            return path;
+        }
+        return null;
     }
 
     /**
@@ -90,13 +133,91 @@ public class Snapshots {
      * @param date
      * @return a file to the snapshot
      */
-    public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
+    public File definePath(final DigestURL url, final int depth, final Date date, final String ext) {
         String id = ASCII.String(url.hash());
-        String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
+        String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
         File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
         return path;
     }
+
+    private void announceStorage(final DigestURL url, final int depth, final Date date) {
+        String id = ASCII.String(url.hash());
+        String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
+        TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(url));
+        if (domaindepth == null) {domaindepth = new TreeMap<Integer, TreeSet<String>>(); this.directory.put(pathToHostDir(url), domaindepth);}
+        TreeSet<String> dateid = domaindepth.get(depth);
+        if (dateid == null) {dateid = new TreeSet<String>(); domaindepth.put(depth, dateid);}
+        dateid.add(ds + '.' + id);
+    }
+
+    public static enum Order {
+        ANY, OLDESTFIRST, LATESTFIRST;
+    }
+
+    /**
+     * select a set of urlhashes from the snapshot directory. The selection is either ordered
+     * by generation date (upwards == OLDESTFIRST or downwards == LATESTFIRST) or in any
+     * order (ANY). The result set can be restricted to a given host and/or depth.
+     * @param host selected host or null for all hosts
+     * @param depth selected depth or null for all depths
+     * @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
+     * @param maxcount the maximum number of urlhashes. If unlimited, submit Integer.MAX_VALUE
+     * @return a map of urlhashes with the associated creation date
+     */
+    public Map<String, Date> select(String host, Integer depth, final Order order, int maxcount) {
+        TreeSet<String> dateIdResult = new TreeSet<>();
+        if (host == null && depth == null) {
+            loop: for (TreeMap<Integer, TreeSet<String>> domaindepth: this.directory.values()) {
+                for (TreeSet<String> keys: domaindepth.values()) {
+                    dateIdResult.addAll(keys);
+                    if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+                }
+            }
+        }
+        if (host == null && depth != null) {
+            loop: for (TreeMap<Integer, TreeSet<String>> domaindepth: this.directory.values()) {
+                TreeSet<String> keys = domaindepth.get(depth);
+                if (keys != null) dateIdResult.addAll(keys);
+                if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+            }
+        }
+        if (host != null && depth == null) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(host, 80));
+            if (domaindepth != null) loop: for (TreeSet<String> keys: domaindepth.values()) {
+                dateIdResult.addAll(keys);
+                if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
+            }
+        }
+        if (host != null && depth != null) {
+            TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostDir(host, 80));
+            if (domaindepth != null) {
+                TreeSet<String> keys = domaindepth.get(depth);
+                if (keys != null) dateIdResult.addAll(keys);
+            }
+        }
+        Map<String, Date> result = new HashMap<>();
+        Iterator<String> i = order == Order.LATESTFIRST ? dateIdResult.descendingIterator() : dateIdResult.iterator();
+        while (i.hasNext() && result.size() < maxcount) {
+            String di = i.next();
+            int p = di.indexOf('.');
+            assert p >= 0;
+            String d = di.substring(0, p);
+            Date date;
+            try {
+                date = GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d);
+            } catch (ParseException e) {
+                try {
+                    date = GenericFormatter.SHORT_DAY_FORMATTER.parse(d);
+                } catch (ParseException ee) {
+                    date = new Date();
+                }
+            }
+            result.put(di.substring(p + 1), date);
+        }
+        return result;
+    }
+
     /**
      * get the depth to a document, helper method for definePath to determine the depth value
      * @param url
@@ -157,10 +278,26 @@ public class Snapshots {
 
     private File pathToShard(final DigestURL url, final int depth) {
         String id = ASCII.String(url.hash());
-        File pathToHostDir = new File(storageLocation, url.getHost() + "." + url.getPort());
-        File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
-        File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
+        File pathToHostDir = new File(storageLocation, pathToHostDir(url));
+        File pathToDepthDir = new File(pathToHostDir, pathToDepthDir(depth));
+        File pathToShard = new File(pathToDepthDir, pathToShard(id));
         return pathToShard;
     }
 
+    private String pathToHostDir(final DigestURL url) {
+        return pathToHostDir(url.getHost(), url.getPort());
+    }
+
+    private String pathToHostDir(final String host, final int port) {
+        return host + "." + port;
+    }
+
+    private String pathToDepthDir(final int depth) {
+        return depth < 10 ? "0" + depth : Integer.toString(depth);
+    }
"0" + depth : Integer.toString(depth); + } + + private String pathToShard(final String urlhash) { + return urlhash.substring(0, 2); + } + } From d97deb5555957d68add3d10820aeb70c21f21408 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 6 Dec 2014 00:43:12 +0100 Subject: [PATCH 3/3] npe fix --- htroot/api/snapshot.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/htroot/api/snapshot.java b/htroot/api/snapshot.java index ff0e8f2d3..3d56130db 100644 --- a/htroot/api/snapshot.java +++ b/htroot/api/snapshot.java @@ -57,19 +57,18 @@ public class snapshot { public static Object respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard) env; - if (post == null) return null; final boolean authenticated = sb.adminAuthenticated(header) >= 2; final String ext = header.get("EXT", ""); if (ext.equals("rss")) { // create a report about the content of the snapshot directory if (!authenticated) return null; - int maxcount = post.getInt("maxcount", 10); - int depthx = post.getInt("depth", -1); + int maxcount = post == null ? 10 : post.getInt("maxcount", 10); + int depthx = post == null ? -1 : post.getInt("depth", -1); Integer depth = depthx == -1 ? null : depthx; - String orderx = post.get("order", "ANY"); + String orderx = post == null ? "ANY" : post.get("order", "ANY"); Snapshots.Order order = Snapshots.Order.valueOf(orderx); - String host = post.get("host"); + String host = post == null ? null : post.get("host"); Map iddate = sb.snapshots.select(host, depth, order, maxcount); // now select the URL from the index for these ids in iddate and make an RSS feed RSSFeed rssfeed = new RSSFeed(Integer.MAX_VALUE); @@ -77,6 +76,7 @@ public class snapshot { for (Map.Entry e: iddate.entrySet()) { try { DigestURL u = sb.index.fulltext().getURL(e.getKey()); + if (u == null) continue; RSSMessage message = new RSSMessage(u.toNormalform(true), "", u, e.getKey()); message.setPubDate(e.getValue()); rssfeed.addMessage(message); @@ -87,7 +87,8 @@ public class snapshot { byte[] rssBinary = UTF8.getBytes(rssfeed.toString()); return new ByteArrayInputStream(rssBinary); } - + + if (post == null) return null; final boolean pdf = ext.equals("pdf"); if (pdf && !authenticated) return null; final boolean pngjpg = ext.equals("png") || ext.equals("jpg");