From b85db72a73da4797d09dc72155c56ca00dd5da0f Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 10 Jun 2013 18:41:00 +0200 Subject: [PATCH] added another response writer which can present search result with texts, separated by sentences. Then, these sentences can be used to search again in the index for the same sentence. This can be used to provide a tool for plagiarism-search. (not finished yet). Try the following: http://localhost:8090/solr/select?q=text_t:flut&grep=wasser&defType=edismax&start=0&rows=3&core=collection1&wt=grephtml .. to search for 'flut' and show only sentences in the result documents which contain the word 'wasser'. Consider this like using a grep-tool on documents: you select the documents by a search query and you grep sentences inside the found documents with the 'grep' attribute. --- htroot/solr/select.java | 4 +- .../GrepHTMLResponseWriter.java | 210 ++++++++++++++++++ .../responsewriter/HTMLResponseWriter.java | 2 +- 3 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java diff --git a/htroot/solr/select.java b/htroot/solr/select.java index c127d2bae..883f79efb 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -33,6 +33,7 @@ import net.yacy.cora.federate.solr.SolrServlet; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter; import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter; +import net.yacy.cora.federate.solr.responsewriter.GrepHTMLResponseWriter; import net.yacy.cora.federate.solr.responsewriter.HTMLResponseWriter; import net.yacy.cora.federate.solr.responsewriter.JsonResponseWriter; import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter; @@ -85,6 +86,7 @@ public class select { RESPONSE_WRITER.put("xslt", xsltWriter); // try i.e. 
http://localhost:8090/solr/select?q=*:*&start=0&rows=10&wt=xslt&tr=json.xsl RESPONSE_WRITER.put("exml", new EnhancedXMLResponseWriter()); RESPONSE_WRITER.put("html", new HTMLResponseWriter()); + RESPONSE_WRITER.put("grephtml", new GrepHTMLResponseWriter()); RESPONSE_WRITER.put("rss", opensearchResponseWriter); //try http://localhost:8090/solr/select?wt=rss&q=olympia&hl=true&hl.fl=text_t,h1,h2 RESPONSE_WRITER.put("opensearch", opensearchResponseWriter); //try http://localhost:8090/solr/select?wt=rss&q=olympia&hl=true&hl.fl=text_t,h1,h2 RESPONSE_WRITER.put("yjson", new JsonResponseWriter()); //try http://localhost:8090/solr/select?wt=json&q=olympia&hl=true&hl.fl=text_t,h1,h2 @@ -109,7 +111,7 @@ public class select { if ("exml".equals(wt)) return "application/rss+xml"; if ("json".equals(wt)) return "application/json"; if ("yjson".equals(wt)) return "application/json"; - if ("html".equals(wt) || "python".equals(wt)) return "text/html"; + if ("html".equals(wt) || "grephtml".equals(wt) || "python".equals(wt)) return "text/html"; if ("php".equals(wt) || "phps".equals(wt)) return "application/x-httpd-php"; if ("ruby".equals(wt)) return "text/html"; if ("raw".equals(wt)) return "application/octet-stream"; diff --git a/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java new file mode 100644 index 000000000..57b763c2b --- /dev/null +++ b/source/net/yacy/cora/federate/solr/responsewriter/GrepHTMLResponseWriter.java @@ -0,0 +1,210 @@ +/** + * GrepHTMLResponseWriter + * Copyright 2013 by Michael Peter Christen + * First released 09.06.2013 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. + */ + +package net.yacy.cora.federate.solr.responsewriter; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.sorting.OrderedScoreMap; +import net.yacy.document.SentenceReader; +import net.yacy.search.Switchboard; +import net.yacy.search.schema.CollectionSchema; + +import org.apache.lucene.document.Document; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.XML; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.QueryResponseWriter; +import org.apache.solr.response.ResultContext; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocList; +import org.apache.solr.search.SolrIndexSearcher; + +/** + * this response writer shows a list of documents with the lines containing matches + * of the search request in 'grep-style', which means it is like doing a grep on a set + * of files. Within the result list, the document is split into the sentences of the + * text part and each sentence is shown as a separate line. grep attributes can be used to + * show leading and trailing lines. 
+ */ +public class GrepHTMLResponseWriter implements QueryResponseWriter { + + private static final Set DEFAULT_FIELD_LIST = new HashSet(); + private static final Pattern dqp = Pattern.compile("\""); + static { + DEFAULT_FIELD_LIST.add(CollectionSchema.id.getSolrFieldName()); + DEFAULT_FIELD_LIST.add(CollectionSchema.sku.getSolrFieldName()); + DEFAULT_FIELD_LIST.add(CollectionSchema.title.getSolrFieldName()); + DEFAULT_FIELD_LIST.add(CollectionSchema.text_t.getSolrFieldName()); + } + + public GrepHTMLResponseWriter() { + super(); + } + + @Override + public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) { + return CONTENT_TYPE_XML_UTF8; + } + + @Override + public void init(@SuppressWarnings("rawtypes") NamedList n) { + } + + @Override + public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException { + NamedList values = rsp.getValues(); + assert values.get("responseHeader") != null; + assert values.get("response") != null; + + writer.write("\n"); + writer.write("\n"); + writer.write("\n"); + SolrParams params = request.getOriginalParams(); + boolean discover = params.getBool("discover", false); + String grep = params.get("grep"); + String query = ""; + String q = params.get("q"); if (q == null) q = ""; + int p = q.indexOf(':'); + if (p >= 0) { + int r = q.charAt(p + 1) == '"' ? 
q.indexOf(p + 2, '"') : q.indexOf(' '); + if (r < 0) r = q.length(); + query = q.substring(p + 1, r); + if (query.length() > 0) { + if (query.charAt(0) == '"') query = query.substring(1); + if (query.charAt(query.length() - 1) == '"') query = query.substring(0, query.length() - 1); + } + } + if (grep == null && query.length() > 0) grep = query; + if (grep.length() > 0) { + if (grep.charAt(0) == '"') grep = grep.substring(1); + if (grep.charAt(grep.length() - 1) == '"') grep = grep.substring(0, grep.length() - 1); + } + NamedList paramsList = params.toNamedList(); + paramsList.remove("wt"); + String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22"); + writer.write("
\"API\"\n"); + writer.write("This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.
\n"); + + DocList response = ((ResultContext) values.get("response")).docs; + final int sz = response.size(); + if (sz > 0) { + SolrIndexSearcher searcher = request.getSearcher(); + DocIterator iterator = response.iterator(); + IndexSchema schema = request.getSchema(); + writer.write("Document Grep for query \"" + query + "\" and grep phrase \"" + grep + "\"\n\n"); + + LinkedHashMap> sentenceCache = new LinkedHashMap>(); + + for (int i = 0; i < sz; i++) { + int id = iterator.nextDoc(); + Document doc = searcher.doc(id, DEFAULT_FIELD_LIST); + LinkedHashMap tdoc = HTMLResponseWriter.translateDoc(schema, doc); + String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName()); + String title = tdoc.get(CollectionSchema.title.getSolrFieldName()); + String text = tdoc.get(CollectionSchema.text_t.getSolrFieldName()); + + ArrayList sentences = new ArrayList(); + if (title != null) sentences.add(title); + SentenceReader sr = new SentenceReader(text); + StringBuilder line; + while (sr.hasNext()) { + line = sr.next(); + if (line.length() > 0) sentences.add(line.toString()); + } + sentenceCache.put(sku, sentences); + } + + OrderedScoreMap scores = null; + if (discover) { + // for each line make a statistic about the number of occurrences somewhere else + SolrConnector connector = Switchboard.getSwitchboard().index.fulltext().getDefaultConnector(); + scores = new OrderedScoreMap(null); + for (Map.Entry> entry: sentenceCache.entrySet()) { + for (String line: entry.getValue()) { + long count = connector.getCountByQuery("text_t:\"" + line + "\""); + if (count > 0) scores.inc(entry.getKey()); + } + } + } + + for (Map.Entry> entry: sentenceCache.entrySet()) { + writeDoc(writer, entry.getKey(), entry.getValue(), grep, scores); + } + } else { + writer.write("No Document Found\n\n"); + } + + writer.write("\n"); + } + + private static final void writeDoc(Writer writer, String url, ArrayList sentences, String grep, OrderedScoreMap scores) throws IOException { + writer.write("
\n"); + writer.write("
\n"); + writer.write("

" + url + "

\n"); + writer.write("
\n"); + int c = 0; + for (String line: sentences) { + if (grep != null && grep.length() > 0 && line.indexOf(grep) < 0) continue; + writer.write("
"); + if (c++ == 0) { + if (grep == null || grep.length() == 0) writer.write("all lines in document"); else {writer.write("matches for grep phrase \"");writer.write(grep);writer.write("\"");} + } + writer.write("
"); + writedd(writer, line, scores); + } + if (scores != null) { + Collection discoveries = scores.keyList(false); + writer.write("
Citations:
"); + for (String u: discoveries) { + writer.write("
"); + writer.write(Integer.toString(scores.get(u))); + writer.write(" citations
"); + writedd(writer, u, scores); + } + } + writer.write("
\n"); + writer.write("
\n"); + writer.write("
\n"); + } + + private static void writedd(Writer writer, String line, OrderedScoreMap scores) throws IOException { + writer.write("
"); + XML.escapeAttributeValue(line, writer); + writer.write("
\n"); + } + +} diff --git a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java index d2ca13f12..7bcabcfa1 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java @@ -140,7 +140,7 @@ public class HTMLResponseWriter implements QueryResponseWriter { writer.write("\n"); } - private static final LinkedHashMap translateDoc(final IndexSchema schema, final Document doc) { + static final LinkedHashMap translateDoc(final IndexSchema schema, final Document doc) { List fields = doc.getFields(); int sz = fields.size(); int fidx1 = 0, fidx2 = 0;