diff --git a/htroot/solr/select.java b/htroot/solr/select.java index c127d2bae..883f79efb 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -33,6 +33,7 @@ import net.yacy.cora.federate.solr.SolrServlet; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter; import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter; +import net.yacy.cora.federate.solr.responsewriter.GrepHTMLResponseWriter; import net.yacy.cora.federate.solr.responsewriter.HTMLResponseWriter; import net.yacy.cora.federate.solr.responsewriter.JsonResponseWriter; import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter; @@ -85,6 +86,7 @@ public class select { RESPONSE_WRITER.put("xslt", xsltWriter); // try i.e. http://localhost:8090/solr/select?q=*:*&start=0&rows=10&wt=xslt&tr=json.xsl RESPONSE_WRITER.put("exml", new EnhancedXMLResponseWriter()); RESPONSE_WRITER.put("html", new HTMLResponseWriter()); + RESPONSE_WRITER.put("grephtml", new GrepHTMLResponseWriter()); RESPONSE_WRITER.put("rss", opensearchResponseWriter); //try http://localhost:8090/solr/select?wt=rss&q=olympia&hl=true&hl.fl=text_t,h1,h2 RESPONSE_WRITER.put("opensearch", opensearchResponseWriter); //try http://localhost:8090/solr/select?wt=rss&q=olympia&hl=true&hl.fl=text_t,h1,h2 RESPONSE_WRITER.put("yjson", new JsonResponseWriter()); //try http://localhost:8090/solr/select?wt=json&q=olympia&hl=true&hl.fl=text_t,h1,h2 @@ -109,7 +111,7 @@ public class select { if ("exml".equals(wt)) return "application/rss+xml"; if ("json".equals(wt)) return "application/json"; if ("yjson".equals(wt)) return "application/json"; - if ("html".equals(wt) || "python".equals(wt)) return "text/html"; + if ("html".equals(wt) || "grephtml".equals(wt) || "python".equals(wt)) return "text/html"; if ("php".equals(wt) || "phps".equals(wt)) return "application/x-httpd-php"; if ("ruby".equals(wt)) return "text/html"; if 
("raw".equals(wt)) return "application/octet-stream"; diff --git a/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java new file mode 100644 index 000000000..57b763c2b --- /dev/null +++ b/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java @@ -0,0 +1,210 @@ +/** + * GrepHTMLResponseWriter + * Copyright 2013 by Michael Peter Christen + * First released 09.06.2013 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.federate.solr.responsewriter; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.sorting.OrderedScoreMap; +import net.yacy.document.SentenceReader; +import net.yacy.search.Switchboard; +import net.yacy.search.schema.CollectionSchema; + +import org.apache.lucene.document.Document; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.XML; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.QueryResponseWriter; +import org.apache.solr.response.ResultContext; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocList; +import org.apache.solr.search.SolrIndexSearcher; + +/** + * This response writer shows a list of documents with the lines containing matches + * of the search request in 'grep-style', which means it is like doing a grep on a set + * of files. Within the result list, the document is split into the sentences of the + * text part and each sentence is shown as a separate line. grep attributes can be used to + * show leading and trailing lines. 
+ */ +public class GrepHTMLResponseWriter implements QueryResponseWriter { + + /* Stored fields fetched for each hit: id, sku, title and text_t. */ private static final Set DEFAULT_FIELD_LIST = new HashSet(); + /* Matches a double quote so it can be replaced by %22 in the API link. */ private static final Pattern dqp = Pattern.compile("\""); + static { + DEFAULT_FIELD_LIST.add(CollectionSchema.id.getSolrFieldName()); + DEFAULT_FIELD_LIST.add(CollectionSchema.sku.getSolrFieldName()); + DEFAULT_FIELD_LIST.add(CollectionSchema.title.getSolrFieldName()); + DEFAULT_FIELD_LIST.add(CollectionSchema.text_t.getSolrFieldName()); + } + + public GrepHTMLResponseWriter() { + super(); + } + + /** Returns CONTENT_TYPE_XML_UTF8. NOTE(review): select.java maps wt=grephtml to text/html, so confirm which content type reaches the client. */ @Override + public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) { + return CONTENT_TYPE_XML_UTF8; + } + + /** No configuration is read; the argument is ignored. */ @Override + public void init(@SuppressWarnings("rawtypes") NamedList n) { + } + + /** Writes the page header and then, for each hit, the title and the sentences of text_t through writeDoc. The search term is the text after the first colon of q, up to the closing quote of a phrase or else up to the next space; the grep parameter defaults to that term. With discover=true every sentence is additionally counted in the index as a text_t phrase and the sku of the hit is incremented for each sentence with a non-zero count. */ @Override + public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException { + NamedList values = rsp.getValues(); + assert values.get("responseHeader") != null; + assert values.get("response") != null; + + writer.write("\n"); + writer.write("\n"); + writer.write("\n"); + SolrParams params = request.getOriginalParams(); + boolean discover = params.getBool("discover", false); + String grep = params.get("grep"); + String query = ""; + String q = params.get("q"); if (q == null) q = ""; + int p = q.indexOf(':'); + if (p >= 0) { + /* bounds check: the colon may be the last character of q */ int r = p + 1 < q.length() && q.charAt(p + 1) == '"' ? 
q.indexOf('"', p + 2) : q.indexOf(' ', p + 1); /* String.indexOf takes (ch, fromIndex); both searches start behind the colon so r never lies before the start of the term */ + if (r < 0) r = q.length(); + query = q.substring(p + 1, r); + if (query.length() > 0) { + if (query.charAt(0) == '"') query = query.substring(1); + if (query.length() > 0 && query.charAt(query.length() - 1) == '"') query = query.substring(0, query.length() - 1); + } + } + if (grep == null) grep = query; /* query is never null, so grep is non-null from here on */ + if (grep.length() > 0) { + if (grep.charAt(0) == '"') grep = grep.substring(1); + if (grep.length() > 0 && grep.charAt(grep.length() - 1) == '"') grep = grep.substring(0, grep.length() - 1); + } + NamedList paramsList = params.toNamedList(); + paramsList.remove("wt"); + String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22"); + writer.write("
\"API\"\n"); + writer.write("This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.
\n"); + + DocList response = ((ResultContext) values.get("response")).docs; + final int sz = response.size(); + if (sz > 0) { + SolrIndexSearcher searcher = request.getSearcher(); + DocIterator iterator = response.iterator(); + IndexSchema schema = request.getSchema(); + /* query and grep come from the request, so they are escaped before being echoed */ writer.write("Document Grep for query \""); XML.escapeAttributeValue(query, writer); writer.write("\" and grep phrase \""); XML.escapeAttributeValue(grep, writer); writer.write("\"\n\n"); + + LinkedHashMap> sentenceCache = new LinkedHashMap>(); + + for (int i = 0; i < sz; i++) { + int id = iterator.nextDoc(); + Document doc = searcher.doc(id, DEFAULT_FIELD_LIST); + LinkedHashMap tdoc = HTMLResponseWriter.translateDoc(schema, doc); + String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName()); + String title = tdoc.get(CollectionSchema.title.getSolrFieldName()); + String text = tdoc.get(CollectionSchema.text_t.getSolrFieldName()); + + ArrayList sentences = new ArrayList(); + if (title != null) sentences.add(title); + /* text_t may be absent just like title; NOTE(review): assumes SentenceReader yields no sentences for an empty string */ SentenceReader sr = new SentenceReader(text == null ? "" : text); + StringBuilder line; + while (sr.hasNext()) { + line = sr.next(); + if (line.length() > 0) sentences.add(line.toString()); + } + sentenceCache.put(sku, sentences); + } + + OrderedScoreMap scores = null; + if (discover) { + // for each line make a statistic about the number of occurrences somewhere else + SolrConnector connector = Switchboard.getSwitchboard().index.fulltext().getDefaultConnector(); + scores = new OrderedScoreMap(null); + for (Map.Entry> entry: sentenceCache.entrySet()) { + for (String line: entry.getValue()) { + /* backslashes and quotes inside the sentence are escaped so they cannot terminate the phrase */ long count = connector.getCountByQuery("text_t:\"" + line.replace("\\", "\\\\").replace("\"", "\\\"") + "\""); + if (count > 0) scores.inc(entry.getKey()); + } + } + } + + for (Map.Entry> entry: sentenceCache.entrySet()) { + writeDoc(writer, entry.getKey(), entry.getValue(), grep, scores); + } + } else { + writer.write("No Document Found\n\n"); + } + + writer.write("\n"); + } + + /** Writes one result entry: the url, then every sentence that contains grep as a substring (all sentences when grep is null or empty), then, if scores is given, each scored key with its count. NOTE(review): url is concatenated into the output without escaping while sentences go through writedd; confirm that stored urls cannot carry markup. */ private static final void writeDoc(Writer writer, String url, ArrayList sentences, String grep, OrderedScoreMap scores) throws IOException { + writer.write("
\n"); + writer.write("
\n"); + writer.write("

" + url + "

\n"); + writer.write("
\n"); + int c = 0; + for (String line: sentences) { + if (grep != null && grep.length() > 0 && line.indexOf(grep) < 0) continue; + writer.write("
"); + if (c++ == 0) { + if (grep == null || grep.length() == 0) writer.write("all lines in document"); else {writer.write("matches for grep phrase \"");XML.escapeAttributeValue(grep, writer);writer.write("\"");} + } + writer.write("
"); + writedd(writer, line, scores); + } + if (scores != null) { + Collection discoveries = scores.keyList(false); + writer.write("
Citations:
"); + for (String u: discoveries) { + writer.write("
"); + writer.write(Integer.toString(scores.get(u))); + writer.write(" citations
"); + writedd(writer, u, scores); + } + } + writer.write("
\n"); + writer.write("
\n"); + writer.write("
\n"); + } + + /** Writes line as one escaped entry. The scores argument is not used. */ private static void writedd(Writer writer, String line, OrderedScoreMap scores) throws IOException { + writer.write("
"); + XML.escapeAttributeValue(line, writer); + writer.write("
\n"); + } + +} diff --git a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java index d2ca13f12..7bcabcfa1 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java @@ -140,7 +140,7 @@ public class HTMLResponseWriter implements QueryResponseWriter { writer.write("\n"); } - private static final LinkedHashMap translateDoc(final IndexSchema schema, final Document doc) { + static final LinkedHashMap translateDoc(final IndexSchema schema, final Document doc) { List fields = doc.getFields(); int sz = fields.size(); int fidx1 = 0, fidx2 = 0;