texts, separated by sentences. These sentences can then be used to search the index again for the same sentence, which can serve as the basis of a tool for plagiarism search (not finished yet). Try the following: http://localhost:8090/solr/select?q=text_t:flut&grep=wasser&defType=edismax&start=0&rows=3&core=collection1&wt=grephtml .. to search for 'flut' and show only those sentences in the result documents which contain the word 'wasser'. Think of this as using a grep tool on documents: you select the documents with a search query and then grep for sentences inside the found documents with the 'grep' attribute.
pull/1/head
parent
856e5c42ae
commit
b85db72a73
@ -0,0 +1,210 @@
|
||||
/**
|
||||
* GrepHTMLResponseWriter
|
||||
* Copyright 2013 by Michael Peter Christen
|
||||
* First released 09.06.2013 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.federate.solr.responsewriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||
import net.yacy.cora.sorting.OrderedScoreMap;
|
||||
import net.yacy.document.SentenceReader;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.XML;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.QueryResponseWriter;
|
||||
import org.apache.solr.response.ResultContext;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.solr.search.DocList;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
|
||||
/**
|
||||
* this response writer shows a list of documents with the lines containing matches
|
||||
* of the search request in 'grep-style', which means it is like doing a grep on a set
|
||||
* of files. Within the result list, the document is splitted into the sentences of the
|
||||
* text part and each sentence is shown as separate line. grep attributes can be used to
|
||||
* show leading and trainling lines.
|
||||
*/
|
||||
public class GrepHTMLResponseWriter implements QueryResponseWriter {
|
||||
|
||||
private static final Set<String> DEFAULT_FIELD_LIST = new HashSet<String>();
|
||||
private static final Pattern dqp = Pattern.compile("\"");
|
||||
static {
|
||||
DEFAULT_FIELD_LIST.add(CollectionSchema.id.getSolrFieldName());
|
||||
DEFAULT_FIELD_LIST.add(CollectionSchema.sku.getSolrFieldName());
|
||||
DEFAULT_FIELD_LIST.add(CollectionSchema.title.getSolrFieldName());
|
||||
DEFAULT_FIELD_LIST.add(CollectionSchema.text_t.getSolrFieldName());
|
||||
}
|
||||
|
||||
public GrepHTMLResponseWriter() {
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
|
||||
return CONTENT_TYPE_XML_UTF8;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(@SuppressWarnings("rawtypes") NamedList n) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException {
|
||||
NamedList<?> values = rsp.getValues();
|
||||
assert values.get("responseHeader") != null;
|
||||
assert values.get("response") != null;
|
||||
|
||||
writer.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
|
||||
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"all\" href=\"/env/base.css\" />\n");
|
||||
writer.write("<link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"/env/style.css\" />\n");
|
||||
SolrParams params = request.getOriginalParams();
|
||||
boolean discover = params.getBool("discover", false);
|
||||
String grep = params.get("grep");
|
||||
String query = "";
|
||||
String q = params.get("q"); if (q == null) q = "";
|
||||
int p = q.indexOf(':');
|
||||
if (p >= 0) {
|
||||
int r = q.charAt(p + 1) == '"' ? q.indexOf(p + 2, '"') : q.indexOf(' ');
|
||||
if (r < 0) r = q.length();
|
||||
query = q.substring(p + 1, r);
|
||||
if (query.length() > 0) {
|
||||
if (query.charAt(0) == '"') query = query.substring(1);
|
||||
if (query.charAt(query.length() - 1) == '"') query = query.substring(0, query.length() - 1);
|
||||
}
|
||||
}
|
||||
if (grep == null && query.length() > 0) grep = query;
|
||||
if (grep.length() > 0) {
|
||||
if (grep.charAt(0) == '"') grep = grep.substring(1);
|
||||
if (grep.charAt(grep.length() - 1) == '"') grep = grep.substring(0, grep.length() - 1);
|
||||
}
|
||||
NamedList<Object> paramsList = params.toNamedList();
|
||||
paramsList.remove("wt");
|
||||
String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22");
|
||||
writer.write("<div id=\"api\"><a href=\"" + xmlquery + "\"><img src=\"../env/grafics/api.png\" width=\"60\" height=\"40\" alt=\"API\" /></a>\n");
|
||||
writer.write("<span>This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.</div>\n");
|
||||
|
||||
DocList response = ((ResultContext) values.get("response")).docs;
|
||||
final int sz = response.size();
|
||||
if (sz > 0) {
|
||||
SolrIndexSearcher searcher = request.getSearcher();
|
||||
DocIterator iterator = response.iterator();
|
||||
IndexSchema schema = request.getSchema();
|
||||
writer.write("<title>Document Grep for query \"" + query + "\" and grep phrase \"" + grep + "\"</title>\n</head><body>\n");
|
||||
|
||||
LinkedHashMap<String, ArrayList<String>> sentenceCache = new LinkedHashMap<String, ArrayList<String>>();
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
int id = iterator.nextDoc();
|
||||
Document doc = searcher.doc(id, DEFAULT_FIELD_LIST);
|
||||
LinkedHashMap<String, String> tdoc = HTMLResponseWriter.translateDoc(schema, doc);
|
||||
String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName());
|
||||
String title = tdoc.get(CollectionSchema.title.getSolrFieldName());
|
||||
String text = tdoc.get(CollectionSchema.text_t.getSolrFieldName());
|
||||
|
||||
ArrayList<String> sentences = new ArrayList<String>();
|
||||
if (title != null) sentences.add(title);
|
||||
SentenceReader sr = new SentenceReader(text);
|
||||
StringBuilder line;
|
||||
while (sr.hasNext()) {
|
||||
line = sr.next();
|
||||
if (line.length() > 0) sentences.add(line.toString());
|
||||
}
|
||||
sentenceCache.put(sku, sentences);
|
||||
}
|
||||
|
||||
OrderedScoreMap<String> scores = null;
|
||||
if (discover) {
|
||||
// for each line make a statistic about the number of occurrences somewhere else
|
||||
SolrConnector connector = Switchboard.getSwitchboard().index.fulltext().getDefaultConnector();
|
||||
scores = new OrderedScoreMap<String>(null);
|
||||
for (Map.Entry<String, ArrayList<String>> entry: sentenceCache.entrySet()) {
|
||||
for (String line: entry.getValue()) {
|
||||
long count = connector.getCountByQuery("text_t:\"" + line + "\"");
|
||||
if (count > 0) scores.inc(entry.getKey());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (Map.Entry<String, ArrayList<String>> entry: sentenceCache.entrySet()) {
|
||||
writeDoc(writer, entry.getKey(), entry.getValue(), grep, scores);
|
||||
}
|
||||
} else {
|
||||
writer.write("<title>No Document Found</title>\n</head><body>\n");
|
||||
}
|
||||
|
||||
writer.write("</body></html>\n");
|
||||
}
|
||||
|
||||
private static final void writeDoc(Writer writer, String url, ArrayList<String> sentences, String grep, OrderedScoreMap<String> scores) throws IOException {
|
||||
writer.write("<form name=\"yacydoc" + url + "\" method=\"post\" action=\"#\" enctype=\"multipart/form-data\" accept-charset=\"UTF-8\">\n");
|
||||
writer.write("<fieldset>\n");
|
||||
writer.write("<h1><a href=\"" + url + "\">" + url + "</a></h1>\n");
|
||||
writer.write("<dl>\n");
|
||||
int c = 0;
|
||||
for (String line: sentences) {
|
||||
if (grep != null && grep.length() > 0 && line.indexOf(grep) < 0) continue;
|
||||
writer.write("<dt>");
|
||||
if (c++ == 0) {
|
||||
if (grep == null || grep.length() == 0) writer.write("all lines in document"); else {writer.write("matches for grep phrase \"");writer.write(grep);writer.write("\"");}
|
||||
}
|
||||
writer.write("</dt>");
|
||||
writedd(writer, line, scores);
|
||||
}
|
||||
if (scores != null) {
|
||||
Collection<String> discoveries = scores.keyList(false);
|
||||
writer.write("<dt>Citations:</dt><dd></dd>");
|
||||
for (String u: discoveries) {
|
||||
writer.write("<dt>");
|
||||
writer.write(Integer.toString(scores.get(u)));
|
||||
writer.write(" citations</dt><dd>");
|
||||
writedd(writer, u, scores);
|
||||
}
|
||||
}
|
||||
writer.write("</dl>\n");
|
||||
writer.write("</fieldset>\n");
|
||||
writer.write("</form>\n");
|
||||
}
|
||||
|
||||
private static void writedd(Writer writer, String line, OrderedScoreMap<String> scores) throws IOException {
|
||||
writer.write("<dd><a href=\"/solr/select?q=text_t:%22");
|
||||
XML.escapeAttributeValue(line, writer);
|
||||
writer.write("%22&rows=100&discover=");
|
||||
writer.write(scores != null ? "true" : "false");
|
||||
writer.write("&wt=grephtml\">");
|
||||
XML.escapeAttributeValue(line, writer);
|
||||
writer.write("</a></dd>\n");
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue