From d988ba50cfde74ab4d4b4b803bb19d41c03cdda7 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 14 Aug 2012 12:40:26 +0200 Subject: [PATCH] added a very rudimentary, incomplete, non-verified GSA response writer for solr. Try this: http://localhost:8090/gsa/searchresult?q=pdf&site=col1&num=10 --- .classpath | 3 +- htroot/gsa/searchresult.java | 120 ++++++++ htroot/solr/select.java | 33 +-- source/de/anomic/server/serverObjects.java | 7 + .../federated/solr/GSAResponseWriter.java | 258 ++++++++++++++++++ .../federated/solr/SolrConnector.java | 2 +- .../search/solr/EmbeddedSolrConnector.java | 2 +- 7 files changed, 402 insertions(+), 23 deletions(-) create mode 100644 htroot/gsa/searchresult.java create mode 100644 source/net/yacy/cora/services/federated/solr/GSAResponseWriter.java diff --git a/.classpath b/.classpath index e237659cc..5c395cf49 100644 --- a/.classpath +++ b/.classpath @@ -1,7 +1,7 @@ - + @@ -11,6 +11,7 @@ + diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java new file mode 100644 index 000000000..e88a40d58 --- /dev/null +++ b/htroot/gsa/searchresult.java @@ -0,0 +1,120 @@ +/** + * search + * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 14.08.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.Writer; + +import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.services.federated.solr.GSAResponseWriter; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.Switchboard; +import net.yacy.search.solr.EmbeddedSolrConnector; + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.util.FastWriter; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; + +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; + +// try +// http://localhost:8090/gsa/search?q=chicken+teriyaki&output=xml&client=test&site=test&sort=date:D:S:d1 + +/** + * This is a gsa result formatter for solr search results. + * The result format is implemented according to + * https://developers.google.com/search-appliance/documentation/68/xml_reference#results_xml + */ +public class searchresult { + + private final static GSAResponseWriter responseWriter = new GSAResponseWriter(); + + /** + * get the right mime type for this streamed result page + * @param header + * @param post + * @param env + * @return + */ + public static String mime(final RequestHeader header, final serverObjects post, final serverSwitch env) { + return "text/xml"; + } + + /** + * @param header + * @param post + * @param env + * @param out + * @return + */ + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env, final OutputStream out) { + + // this uses the methods in the jetty servlet environment and can be removed if jetty in implemented + Switchboard sb = (Switchboard) env; + + // check if user is allowed to search (can be switched in /ConfigPortal.html) + final boolean searchAllowed = sb.getConfigBool("publicSearchpage", true) || sb.verifyAuthentication(header); + if (!searchAllowed) return null; + + // check post + if (post == null) return null; + + // rename post fields according to result style + //post.put(CommonParams.Q, post.remove("q")); // same as solr + //post.put(CommonParams.START, post.remove("start")); // same as solr + //post.put(, post.remove("site"));//required, example: col1|col2 + //post.put(, post.remove("client"));//required, example: myfrontend + //post.put(, post.remove("output"));//required, example: xml,xml_no_dtd + post.put(CommonParams.ROWS, post.remove("num")); + + // get the embedded connector + EmbeddedSolrConnector connector = (EmbeddedSolrConnector) sb.index.getLocalSolr(); + if (connector == null) return null; + + // do the solr request + SolrQueryRequest req = connector.request(post.toSolrParams()); + SolrQueryResponse response = null; + Exception e = null; + try {response = connector.query(req);} catch (SolrException ee) {e = ee;} + if (response != null) e = response.getException(); + if (e != null) { + Log.logException(e); + return null; + } + + // write the result directly to the output stream + Writer ow = new FastWriter(new OutputStreamWriter(out, UTF8.charset)); + try { + responseWriter.write(ow, req, response); + ow.flush(); + } catch (IOException e1) { + } finally { + req.close(); + try {ow.close();} catch (IOException e1) {} + } + + return null; + } +} \ No newline at end of file diff --git a/htroot/solr/select.java b/htroot/solr/select.java index c0f5323a6..438346f22 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -34,7 +34,6 @@ import net.yacy.cora.services.federated.solr.OpensearchResponseWriter; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; -import net.yacy.search.index.YaCySchema; import net.yacy.search.solr.EmbeddedSolrConnector; import net.yacy.search.solr.SolrServlet; @@ -55,9 +54,8 @@ import de.anomic.server.serverSwitch; // http://localhost:8090/solr/select?q=*:*&start=0&rows=10&indent=on /** - * + * this is a standard solr search result formatter as defined in * http://wiki.apache.org/solr/SolrQuerySyntax - * */ public class select { @@ -128,10 +126,18 @@ public class select { if (!post.containsKey(CommonParams.START)) post.put(CommonParams.START, post.remove("startRecord")); // sru patch if (!post.containsKey(CommonParams.ROWS)) post.put(CommonParams.ROWS, post.remove("maximumRecords")); // sru patch - // check if all required post fields are there - if (!post.containsKey(CommonParams.DF)) post.put(CommonParams.DF, YaCySchema.text_t.name()); // set default field to all fields - if (!post.containsKey(CommonParams.START)) post.put(CommonParams.START, "0"); // set default start item - if (!post.containsKey(CommonParams.ROWS)) post.put(CommonParams.ROWS, "10"); // set default number of search results + // get a response writer for the result + String wt = post.get(CommonParams.WT, "xml"); // maybe use /solr/select?q=*:*&start=0&rows=10&wt=exml + QueryResponseWriter responseWriter = RESPONSE_WRITER.get(wt); + if (responseWriter == null) return null; + if (responseWriter instanceof OpensearchResponseWriter) { + // set the title every time, it is possible that it has changed + final String promoteSearchPageGreeting = + (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) ? env.getConfig( + "network.unit.description", + "") : env.getConfig(SwitchboardConstants.GREETING, ""); + ((OpensearchResponseWriter) responseWriter).setTitle(promoteSearchPageGreeting); + } // get the embedded connector EmbeddedSolrConnector connector = (EmbeddedSolrConnector) sb.index.getLocalSolr(); @@ -148,19 +154,6 @@ public class select { return null; } - // get a response writer for the result - String wt = post.get(CommonParams.WT, "xml"); // maybe use /solr/select?q=*:*&start=0&rows=10&wt=exml - QueryResponseWriter responseWriter = RESPONSE_WRITER.get(wt); - if (responseWriter == null) return null; - if (responseWriter instanceof OpensearchResponseWriter) { - // set the title every time, it is possible that it has changed - final String promoteSearchPageGreeting = - (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) ? env.getConfig( - "network.unit.description", - "") : env.getConfig(SwitchboardConstants.GREETING, ""); - ((OpensearchResponseWriter) responseWriter).setTitle(promoteSearchPageGreeting); - } - // write the result directly to the output stream Writer ow = new FastWriter(new OutputStreamWriter(out, UTF8.charset)); try { diff --git a/source/de/anomic/server/serverObjects.java b/source/de/anomic/server/serverObjects.java index e8e62481f..165cb7b15 100644 --- a/source/de/anomic/server/serverObjects.java +++ b/source/de/anomic/server/serverObjects.java @@ -62,7 +62,9 @@ import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.util.Formatter; import net.yacy.search.Switchboard; +import net.yacy.search.index.YaCySchema; +import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.MultiMapSolrParams; import org.apache.solr.common.params.SolrParams; @@ -469,6 +471,11 @@ public class serverObjects extends HashMap implements Cloneable } public SolrParams toSolrParams() { + // check if all required post fields are there + if (!this.containsKey(CommonParams.DF)) this.put(CommonParams.DF, YaCySchema.text_t.name()); // set default field to all fields + if (!this.containsKey(CommonParams.START)) this.put(CommonParams.START, "0"); // set default start item + if (!this.containsKey(CommonParams.ROWS)) this.put(CommonParams.ROWS, "10"); // set default number of search results + Map m = new HashMap(); for (Map.Entry e: this.entrySet()) { m.put(e.getKey(), new String[]{e.getValue()}); diff --git a/source/net/yacy/cora/services/federated/solr/GSAResponseWriter.java b/source/net/yacy/cora/services/federated/solr/GSAResponseWriter.java new file mode 100644 index 000000000..564936f9b --- /dev/null +++ b/source/net/yacy/cora/services/federated/solr/GSAResponseWriter.java @@ -0,0 +1,258 @@ +/** + * GSAResponseWriter + * Copyright 2012 by Michael Peter Christen + * First released 14.08.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.services.federated.solr; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.lod.vocabulary.DublinCore; +import net.yacy.search.index.YaCySchema; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Fieldable; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.QueryResponseWriter; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocSlice; +import org.apache.solr.search.SolrIndexSearcher; + +/** + * implementation of a GSA search result. + * example: GET /gsa/searchresult?q=chicken+teriyaki&output=xml&client=test&site=test&sort=date:D:S:d1 + * for a xml reference, see https://developers.google.com/search-appliance/documentation/68/xml_reference + */ +public class GSAResponseWriter implements QueryResponseWriter { + + private static final char lb = '\n'; + private enum GSAToken { + CACHE_LAST_MODIFIED, // Date that the document was crawled, as specified in the Date HTTP header when the document was crawled for this index. + CRAWLDATE, // An optional element that shows the date when the page was crawled. It is shown only for pages that have been crawled within the past two days. + U, // The URL of the search result. + UE, // The URL-encoded version of the URL that is in the U parameter. + T, // The title of the search result. + RK, // Provides a ranking number used internally by the search appliance. + ENT_SOURCE, // Identifies the application ID (serial number) of the search appliance that contributes to a result. Example: S5-KUB000F0ADETLA + FS, // Additional details about the search result. + S, // The snippet for the search result. Query terms appear in bold in the results. Line breaks are included for proper text wrapping. + LANG, // Indicates the language of the search result. The LANG element contains a two-letter language code. + HAS; // Encapsulates special features that are included for this search result. + } + + + private static final char[] XML_START = ( + "\n\n").toCharArray(); + private static final char[] XML_STOP = "\n".toCharArray(); + + // define a list of simple YaCySchema -> RSS Token matchings + private static final Map field2tag = new HashMap(); + + // pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching + private static final YaCySchema[] extrafields = new YaCySchema[]{ + YaCySchema.id, YaCySchema.title, YaCySchema.description, YaCySchema.text_t, + YaCySchema.h1_txt, YaCySchema.h2_txt, YaCySchema.h3_txt, YaCySchema.h4_txt, YaCySchema.h5_txt, YaCySchema.h6_txt, + }; + private static final Set SOLR_FIELDS = new HashSet(); + static { + field2tag.put(YaCySchema.last_modified.name(), GSAToken.CACHE_LAST_MODIFIED.name()); + field2tag.put(YaCySchema.load_date_dt.name(), GSAToken.CRAWLDATE.name()); + field2tag.put(YaCySchema.language_txt.name(), GSAToken.LANG.name()); + SOLR_FIELDS.addAll(field2tag.keySet()); + for (YaCySchema field: extrafields) SOLR_FIELDS.add(field.name()); + } + + private static class ResHead { + public int offset, rows, numFound; + //public int status, QTime; + //public String df, q, wt; + //public float maxScore; + } + + public GSAResponseWriter() { + super(); + } + + @Override + public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) { + return CONTENT_TYPE_XML_UTF8; + } + + @Override + public void init(@SuppressWarnings("rawtypes") NamedList n) { + } + + @Override + public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException { + assert rsp.getValues().get("responseHeader") != null; + assert rsp.getValues().get("response") != null; + + @SuppressWarnings("unchecked") + SimpleOrderedMap responseHeader = (SimpleOrderedMap) rsp.getResponseHeader(); + DocSlice response = (DocSlice) rsp.getValues().get("response"); + + // parse response header + ResHead resHead = new ResHead(); + NamedList val0 = (NamedList) responseHeader.get("params"); + resHead.rows = Integer.parseInt((String) val0.get("rows")); + resHead.offset = response.offset(); // equal to 'start' + resHead.numFound = response.matches(); + //resHead.df = (String) val0.get("df"); + //resHead.q = (String) val0.get("q"); + //resHead.wt = (String) val0.get("wt"); + //resHead.status = (Integer) responseHeader.get("status"); + //resHead.QTime = (Integer) responseHeader.get("QTime"); + //resHead.maxScore = response.maxScore(); + + // write header + writer.write(XML_START); + paramTag(writer, "start", Integer.toString(resHead.offset)); + paramTag(writer, "num", Integer.toString(resHead.rows)); + + // parse body + final int responseCount = response.size(); + SolrIndexSearcher searcher = request.getSearcher(); + DocIterator iterator = response.iterator(); + for (int i = 0; i < responseCount; i++) { + OpensearchResponseWriter.openTag(writer, "R"); + int id = iterator.nextDoc(); + Document doc = searcher.doc(id, SOLR_FIELDS); + List fields = doc.getFields(); + int fieldc = fields.size(); + List texts = new ArrayList(); + String description = ""; + for (int j = 0; j < fieldc; j++) { + Fieldable value = fields.get(j); + String fieldName = value.name(); + + // apply generic matching rule + String stag = field2tag.get(fieldName); + if (stag != null) { + OpensearchResponseWriter.solitaireTag(writer, stag, value.stringValue()); + continue; + } + +/* + + + + +*/ + + // if the rule is not generic, use the specific here + if (YaCySchema.sku.name().equals(fieldName)) { + String U = value.stringValue(); + OpensearchResponseWriter.solitaireTag(writer, GSAToken.U.name(), U); + OpensearchResponseWriter.solitaireTag(writer, GSAToken.UE.name(), U); + continue; + } + if (YaCySchema.title.name().equals(fieldName)) { + OpensearchResponseWriter.solitaireTag(writer, GSAToken.T.name(), value.stringValue()); + texts.add(value.stringValue()); + continue; + } + if (YaCySchema.description.name().equals(fieldName)) { + description = value.stringValue(); + OpensearchResponseWriter.solitaireTag(writer, DublinCore.Description.getURIref(), description); + texts.add(description); + continue; + } + if (YaCySchema.text_t.name().equals(fieldName)) { + texts.add(value.stringValue()); + continue; + } + if (YaCySchema.h1_txt.name().equals(fieldName) || YaCySchema.h2_txt.name().equals(fieldName) || + YaCySchema.h3_txt.name().equals(fieldName) || YaCySchema.h4_txt.name().equals(fieldName) || + YaCySchema.h5_txt.name().equals(fieldName) || YaCySchema.h6_txt.name().equals(fieldName)) { + // because these are multi-valued fields, there can be several of each + texts.add(value.stringValue()); + continue; + } + } + // compute snippet from texts + OpensearchResponseWriter.solitaireTag(writer, RSSMessage.Token.description.name(), description); + OpensearchResponseWriter.solitaireTag(writer, GSAToken.ENT_SOURCE.name(), "YaCy"); + OpensearchResponseWriter.closeTag(writer, "R"); + } + + writer.write(XML_STOP); + } + + + public static void paramTag(final Writer writer, final String tagname, String value) throws IOException { + if (value == null || value.length() == 0) return; + writer.write(""); writer.write(lb); + } +} + +/* + + +0.053898 +pdf + + + + + + + + + + + + + + +296 + + + + + + + + + + + + +de + + + + + +*/ \ No newline at end of file diff --git a/source/net/yacy/cora/services/federated/solr/SolrConnector.java b/source/net/yacy/cora/services/federated/solr/SolrConnector.java index 310991075..41434c4a0 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrConnector.java @@ -103,7 +103,7 @@ public interface SolrConnector { * @param querystring * @throws IOException */ - public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException; + public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException, SolrException; /** * get the size of the index diff --git a/source/net/yacy/search/solr/EmbeddedSolrConnector.java b/source/net/yacy/search/solr/EmbeddedSolrConnector.java index 10ceee786..9b24d9ab8 100644 --- a/source/net/yacy/search/solr/EmbeddedSolrConnector.java +++ b/source/net/yacy/search/solr/EmbeddedSolrConnector.java @@ -143,7 +143,7 @@ public class EmbeddedSolrConnector extends AbstractSolrConnector implements Solr return req; } - public SolrQueryResponse query(SolrQueryRequest req) { + public SolrQueryResponse query(SolrQueryRequest req) throws SolrException { final long startTime = System.currentTimeMillis(); SolrQueryResponse rsp = new SolrQueryResponse();