From b85db72a73da4797d09dc72155c56ca00dd5da0f Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Mon, 10 Jun 2013 18:41:00 +0200
Subject: [PATCH] added another response writer which can present search results
 with texts, split into sentences. Then, these sentences can be used to
 search again in the index for the same sentence. This can be used to provide
 a tool for plagiarism-search. (not finished yet). Try the following:
 http://localhost:8090/solr/select?q=text_t:flut&grep=wasser&defType=edismax&start=0&rows=3&core=collection1&wt=grephtml
 .. to search for 'flut' and show only sentences in the result documents which
 contain the word 'wasser'. Consider this like using a grep-tool on documents:
 you select the documents by a search query and you grep sentences inside the
 found documents with the 'grep' attribute.

---
 htroot/solr/select.java                       |   4 +-
 .../GrepHTMLResponseWriter.java               | 210 ++++++++++++++++++
 .../responsewriter/HTMLResponseWriter.java    |   2 +-
 3 files changed, 214 insertions(+), 2 deletions(-)
 create mode 100644 source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java

diff --git a/htroot/solr/select.java b/htroot/solr/select.java
index c127d2bae..883f79efb 100644
--- a/htroot/solr/select.java
+++ b/htroot/solr/select.java
@@ -33,6 +33,7 @@ import net.yacy.cora.federate.solr.SolrServlet;
 import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
 import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter;
 import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter;
+import net.yacy.cora.federate.solr.responsewriter.GrepHTMLResponseWriter;
 import net.yacy.cora.federate.solr.responsewriter.HTMLResponseWriter;
 import net.yacy.cora.federate.solr.responsewriter.JsonResponseWriter;
 import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter;
@@ -85,6 +86,7 @@ public class select {
         RESPONSE_WRITER.put("xslt", xsltWriter); // try i.e. http://localhost:8090/solr/select?q=*:*&start=0&rows=10&wt=xslt&tr=json.xsl
         RESPONSE_WRITER.put("exml", new EnhancedXMLResponseWriter());
         RESPONSE_WRITER.put("html", new HTMLResponseWriter());
+        RESPONSE_WRITER.put("grephtml", new GrepHTMLResponseWriter());
         RESPONSE_WRITER.put("rss", opensearchResponseWriter); //try http://localhost:8090/solr/select?wt=rss&q=olympia&hl=true&hl.fl=text_t,h1,h2
         RESPONSE_WRITER.put("opensearch", opensearchResponseWriter); //try http://localhost:8090/solr/select?wt=rss&q=olympia&hl=true&hl.fl=text_t,h1,h2
         RESPONSE_WRITER.put("yjson", new JsonResponseWriter()); //try http://localhost:8090/solr/select?wt=json&q=olympia&hl=true&hl.fl=text_t,h1,h2
@@ -109,7 +111,7 @@ public class select {
         if ("exml".equals(wt)) return "application/rss+xml";
         if ("json".equals(wt)) return "application/json";
         if ("yjson".equals(wt)) return "application/json";
-        if ("html".equals(wt) || "python".equals(wt)) return "text/html";
+        if ("html".equals(wt) || "grephtml".equals(wt) || "python".equals(wt)) return "text/html";
         if ("php".equals(wt) || "phps".equals(wt)) return "application/x-httpd-php";
         if ("ruby".equals(wt)) return "text/html";
         if ("raw".equals(wt)) return "application/octet-stream";
diff --git a/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java
new file mode 100644
index 000000000..57b763c2b
--- /dev/null
+++ b/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java
@@ -0,0 +1,303 @@
+/**
+ *  GrepHTMLResponseWriter
+ *  Copyright 2013 by Michael Peter Christen
+ *  First released 09.06.2013 at http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.cora.federate.solr.responsewriter;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import net.yacy.cora.federate.solr.connector.SolrConnector;
+import net.yacy.cora.sorting.OrderedScoreMap;
+import net.yacy.document.SentenceReader;
+import net.yacy.search.Switchboard;
+import net.yacy.search.schema.CollectionSchema;
+
+import org.apache.lucene.document.Document;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.XML;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.QueryResponseWriter;
+import org.apache.solr.response.ResultContext;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.DocList;
+import org.apache.solr.search.SolrIndexSearcher;
+
+/**
+ * This response writer shows a list of documents with the lines containing matches
+ * of the search request in 'grep-style', which means it is like doing a grep on a set
+ * of files. Within the result list, each document is split into the sentences of its
+ * text part and each sentence is shown as a separate line.
+ * <p>
+ * Request parameters read by this writer:
+ * <ul>
+ * <li><code>q</code>: the search query; the term after the first ':' is the default grep phrase</li>
+ * <li><code>grep</code>: only sentences which contain this phrase are shown</li>
+ * <li><code>discover</code>: if true, each sentence is searched again in the index</li>
+ * </ul>
+ * Showing leading and trailing lines around a match is not implemented yet.
+ */
+public class GrepHTMLResponseWriter implements QueryResponseWriter {
+
+    /** content type of the page produced by {@link #write}: HTML, not XML */
+    private static final String CONTENT_TYPE_HTML_UTF8 = "text/html; charset=UTF-8";
+    /** the index fields which are loaded for each document of the result */
+    private static final Set<String> DEFAULT_FIELD_LIST = new HashSet<String>();
+    /** matches double quotes; they are replaced by %22 in the api link */
+    private static final Pattern dqp = Pattern.compile("\"");
+    static {
+        DEFAULT_FIELD_LIST.add(CollectionSchema.id.getSolrFieldName());
+        DEFAULT_FIELD_LIST.add(CollectionSchema.sku.getSolrFieldName());
+        DEFAULT_FIELD_LIST.add(CollectionSchema.title.getSolrFieldName());
+        DEFAULT_FIELD_LIST.add(CollectionSchema.text_t.getSolrFieldName());
+    }
+
+    public GrepHTMLResponseWriter() {
+        super();
+    }
+
+    @Override
+    public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
+        return CONTENT_TYPE_HTML_UTF8;
+    }
+
+    @Override
+    public void init(@SuppressWarnings("rawtypes") NamedList n) {
+        // this writer has no configuration
+    }
+
+    /**
+     * Write the documents of the search result as a HTML page. Every document is shown
+     * with its title and the sentences of its text; if a grep phrase is given, only the
+     * sentences containing that phrase are shown.
+     * @param writer the target of the HTML page
+     * @param request the search request; the parameters q, grep and discover are evaluated
+     * @param rsp the search response, it must contain the result list as "response"
+     * @throws IOException if writing the page or reading from the index fails
+     */
+    @Override
+    public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException {
+        NamedList<?> values = rsp.getValues();
+        assert values.get("responseHeader") != null;
+        assert values.get("response") != null;
+
+        SolrParams params = request.getOriginalParams();
+        boolean discover = params.getBool("discover", false);
+        String q = params.get("q");
+        String query = extractQueryTerm(q == null ? "" : q);
+        // without a grep parameter the query term is the grep phrase; it may be empty, never null
+        String grep = params.get("grep");
+        grep = grep == null ? query : stripQuotes(grep);
+        NamedList<Object> paramsList = params.toNamedList();
+        paramsList.remove("wt");
+        String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
+        DocList response = ((ResultContext) values.get("response")).docs;
+        final int sz = response.size();
+
+        // the head must be opened and completed before any visible content is written
+        writer.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
+        writer.write("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n");
+        writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"all\" href=\"/env/base.css\" />\n");
+        writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"/env/style.css\" />\n");
+        if (sz > 0) {
+            // query and grep phrase are user input and must be escaped
+            writer.write("<title>Document Grep for query \"");
+            XML.escapeCharData(query, writer);
+            writer.write("\" and grep phrase \"");
+            XML.escapeCharData(grep, writer);
+            writer.write("\"</title>\n</head><body>\n");
+        } else {
+            writer.write("<title>No Document Found</title>\n</head><body>\n");
+        }
+        writer.write("<div id=\"api\"><a href=\"");
+        XML.escapeAttributeValue(xmlquery, writer);
+        writer.write("\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
+        writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.</span></div>\n");
+
+        if (sz > 0) {
+            SolrIndexSearcher searcher = request.getSearcher();
+            DocIterator iterator = response.iterator();
+            IndexSchema schema = request.getSchema();
+            // maps the url of each document to its title and sentences, in result order
+            Map<String, List<String>> sentenceCache = new LinkedHashMap<String, List<String>>();
+
+            for (int i = 0; i < sz; i++) {
+                int id = iterator.nextDoc();
+                Document doc = searcher.doc(id, DEFAULT_FIELD_LIST);
+                LinkedHashMap<String, String> tdoc = HTMLResponseWriter.translateDoc(schema, doc);
+                String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName());
+                String title = tdoc.get(CollectionSchema.title.getSolrFieldName());
+                String text = tdoc.get(CollectionSchema.text_t.getSolrFieldName());
+                if (sku == null) continue; // without an url the document cannot be linked
+
+                List<String> sentences = new ArrayList<String>();
+                if (title != null) sentences.add(title);
+                if (text != null) { // a document may have no text part
+                    SentenceReader sr = new SentenceReader(text);
+                    while (sr.hasNext()) {
+                        StringBuilder line = sr.next();
+                        if (line.length() > 0) sentences.add(line.toString());
+                    }
+                }
+                sentenceCache.put(sku, sentences);
+            }
+
+            OrderedScoreMap<String> scores = null;
+            if (discover) {
+                // for each line make a statistic about the number of occurrences somewhere else
+                SolrConnector connector = Switchboard.getSwitchboard().index.fulltext().getDefaultConnector();
+                scores = new OrderedScoreMap<String>(null);
+                for (Map.Entry<String, List<String>> entry: sentenceCache.entrySet()) {
+                    for (String line: entry.getValue()) {
+                        long count = connector.getCountByQuery("text_t:\"" + escapePhrase(line) + "\"");
+                        if (count > 0) scores.inc(entry.getKey());
+                    }
+                }
+            }
+
+            for (Map.Entry<String, List<String>> entry: sentenceCache.entrySet()) {
+                writeDoc(writer, entry.getKey(), entry.getValue(), grep, scores);
+            }
+        }
+
+        writer.write("</body></html>\n");
+    }
+
+    /**
+     * Extract the first search term from a query like <code>field:term</code> or
+     * <code>field:"a phrase"</code>.
+     * @param q the query string, must not be null
+     * @return the term without enclosing double quotes; empty if q contains no ':'
+     */
+    private static String extractQueryTerm(final String q) {
+        int p = q.indexOf(':');
+        if (p < 0) return "";
+        int start = p + 1;
+        // a quoted phrase ends at its closing quote, a plain term at the next space
+        boolean quoted = start < q.length() && q.charAt(start) == '"';
+        int end = quoted ? q.indexOf('"', start + 1) : q.indexOf(' ', start);
+        if (end < 0) end = q.length();
+        return stripQuotes(q.substring(start, end));
+    }
+
+    /**
+     * Remove one leading and one trailing double quote, if present.
+     * @param s the string, must not be null
+     * @return s without enclosing double quotes
+     */
+    private static String stripQuotes(final String s) {
+        int start = 0, end = s.length();
+        if (end > start && s.charAt(start) == '"') start++;
+        if (end > start && s.charAt(end - 1) == '"') end--;
+        return s.substring(start, end);
+    }
+
+    /**
+     * Escape a text so it can be placed between double quotes in a query string.
+     * @param phrase the text, must not be null
+     * @return the phrase with backslashes and double quotes escaped by a backslash
+     */
+    private static String escapePhrase(final String phrase) {
+        return phrase.replace("\\", "\\\\").replace("\"", "\\\"");
+    }
+
+    /**
+     * Write one document as a form with a definition list of its sentences.
+     * @param writer the target of the HTML page
+     * @param url the url of the document, must not be null
+     * @param sentences the title and the sentences of the document
+     * @param grep if not empty, only sentences containing this phrase are written
+     * @param scores the citation counts from discovery, or null if discovery is off
+     * @throws IOException if writing fails
+     */
+    private static final void writeDoc(Writer writer, String url, List<String> sentences, String grep, OrderedScoreMap<String> scores) throws IOException {
+        writer.write("<form name=\"yacydoc");
+        XML.escapeAttributeValue(url, writer);
+        writer.write("\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
+        writer.write("<fieldset>\n");
+        writer.write("<h1><a href=\"");
+        XML.escapeAttributeValue(url, writer);
+        writer.write("\">");
+        XML.escapeCharData(url, writer);
+        writer.write("</a></h1>\n");
+        writer.write("<dl>\n");
+        final boolean hasGrep = grep != null && grep.length() > 0;
+        int c = 0;
+        for (String line: sentences) {
+            if (hasGrep && line.indexOf(grep) < 0) continue;
+            writer.write("<dt>");
+            if (c++ == 0) { // only the first shown line gets a caption
+                if (hasGrep) {
+                    writer.write("matches for grep phrase \"");
+                    XML.escapeCharData(grep, writer);
+                    writer.write("\"");
+                } else {
+                    writer.write("all lines in document");
+                }
+            }
+            writer.write("</dt>");
+            writedd(writer, line, scores);
+        }
+        if (scores != null) {
+            Collection<String> discoveries = scores.keyList(false);
+            writer.write("<dt>Citations:</dt><dd></dd>");
+            for (String u: discoveries) {
+                writer.write("<dt>");
+                writer.write(Integer.toString(scores.get(u)));
+                // writedd opens the dd element itself
+                writer.write(" citations</dt>");
+                writedd(writer, u, scores);
+            }
+        }
+        writer.write("</dl>\n");
+        writer.write("</fieldset>\n");
+        writer.write("</form>\n");
+    }
+
+    /**
+     * Write a text as dd element which links to a phrase search for that text.
+     * @param writer the target of the HTML page
+     * @param line the text to show and to search for
+     * @param scores if not null, the link switches discovery on
+     * @throws IOException if writing fails
+     */
+    private static void writedd(Writer writer, String line, OrderedScoreMap<String> scores) throws IOException {
+        writer.write("<dd><a href=\"/solr/select?q=text_t:%22");
+        // the text becomes part of an url: escape it for the phrase, then url-encode it
+        XML.escapeAttributeValue(URLEncoder.encode(escapePhrase(line), "UTF-8"), writer);
+        writer.write("%22&amp;rows=100&amp;discover=");
+        writer.write(scores != null ? "true" : "false");
+        writer.write("&amp;wt=grephtml\">");
+        XML.escapeCharData(line, writer);
+        writer.write("</a></dd>\n");
+    }
+
+}
diff --git a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java
index d2ca13f12..7bcabcfa1 100644
--- a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java
+++ b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java
@@ -140,7 +140,7 @@ public class HTMLResponseWriter implements QueryResponseWriter {
         writer.write("</form>\n");
     }
     
-    private static final LinkedHashMap<String, String> translateDoc(final IndexSchema schema, final Document doc) {
+    static final LinkedHashMap<String, String> translateDoc(final IndexSchema schema, final Document doc) {
         List<IndexableField> fields = doc.getFields();
         int sz = fields.size();
         int fidx1 = 0, fidx2 = 0;