- added a SolrQueryResponse2SolrDocumentList method which is able to

work around the unfolding process in Solr's BinaryResponseWriter.
This was a huge performance bottleneck in the embedded solr connector
and the problem is actually on the Solr side, but we now have a workaround.
- This made it possible to abstract a high-performance index access
method, which is implemented as the method getDocumentListByParams. That
method is also implemented in the SolrServerConnector and provides
very efficient access to a solr index if the index is embedded.
- a popular use of the document list retrieval is a result count which
can now also make use of the new method, via getDocumentCountByParams.
- enhanced the Error cache, which no longer stores error documents
within the RAM cache if the document is also written to solr. When
documents are retrieved from the cache, they are read from the RAM
cache first and, if not present there, from the Solr index.
pull/1/head
Michael Peter Christen 11 years ago
parent 74466d731a
commit 2702d9e56b

@ -46,6 +46,7 @@ import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.ModifiableSolrParams;
public abstract class AbstractSolrConnector implements SolrConnector {
@ -179,11 +180,16 @@ public abstract class AbstractSolrConnector implements SolrConnector {
params.setIncludeScore(false);
// query the server
QueryResponse rsp = getResponseByParams(params);
final SolrDocumentList docs = rsp.getResults();
final SolrDocumentList docs = getDocumentListByParams(params);
return docs;
}
/**
 * Counts the documents matching the given query parameters.
 * The count is the numFound value of the list returned by getDocumentListByParams;
 * if that method returns no list at all, zero is reported.
 * @param params the solr query parameters
 * @return the number of matching documents, 0 if no list was returned
 * @throws IOException
 * @throws SolrException
 */
@Override
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException {
    final SolrDocumentList matches = getDocumentListByParams(params);
    if (matches == null) return 0;
    return matches.getNumFound();
}
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
@ -205,10 +211,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
params.setIncludeScore(false);
// query the server
QueryResponse rsp = getResponseByParams(params);
final SolrDocumentList docs = rsp.getResults();
boolean exist = docs == null ? false : docs.getNumFound() > 0;
return exist;
return getDocumentCountByParams(params) > 0;
}
/**
@ -237,8 +240,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
params.setIncludeScore(false);
// query the server
QueryResponse rsp = getResponseByParams(params);
final SolrDocumentList docs = rsp.getResults();
final SolrDocumentList docs = getDocumentListByParams(params);
// construct a new id list from that
HashSet<String> idsr = new HashSet<String>();
for (SolrDocument doc : docs) {
@ -266,9 +268,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
params.setIncludeScore(false);
// query the server
QueryResponse rsp = getResponseByParams(params);
final SolrDocumentList docs = rsp.getResults();
return docs == null ? 0 : docs.getNumFound();
return getDocumentCountByParams(params);
}
/**
@ -325,12 +325,12 @@ public abstract class AbstractSolrConnector implements SolrConnector {
// query the server
try {
final QueryResponse rsp = getResponseByParams(query);
final SolrDocumentList docs = rsp.getResults();
if (docs.isEmpty()) return null;
final SolrDocumentList docs = getDocumentListByParams(query);
if (docs == null || docs.isEmpty()) return null;
return docs.get(0);
} catch (final Throwable e) {
throw new IOException(e.getMessage(), e);
}
}
}

@ -221,6 +221,12 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
QueryResponse list = this.solr.getResponseByParams(query);
return list;
}
/**
 * Get the solr document list for the given query parameters.
 * The request is forwarded unchanged to this.solr and its result is returned as-is.
 * @param params the solr query parameters
 * @return the document list produced by this.solr
 * @throws IOException
 * @throws SolrException
 */
@Override
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
    return this.solr.getDocumentListByParams(params);
}
@Override
public long getCountByQuery(final String querystring) throws IOException {

@ -418,6 +418,18 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
return this.connector.getResponseByParams(query);
}
/**
 * Get the solr document list for the given query parameters.
 * The request is forwarded unchanged to this.connector and its result is returned as-is.
 * @param params the solr query parameters
 * @return the document list produced by this.connector
 * @throws IOException
 * @throws SolrException
 */
@Override
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
    return this.connector.getDocumentListByParams(params);
}
/**
 * Get the number of results for the given query parameters.
 * The count is requested directly from this.connector instead of retrieving a
 * document list first, so that a connector with a dedicated count implementation
 * does not have to build documents that are discarded right away.
 * @param params the solr query parameters
 * @return the number of matching documents as reported by this.connector
 * @throws IOException
 * @throws SolrException
 */
@Override
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException {
    return this.connector.getDocumentCountByParams(params);
}
@Override
public SolrDocumentList getDocumentListByQuery(String querystring, int offset, int count, String... fields) throws IOException, SolrException {
return this.connector.getDocumentListByQuery(querystring, offset, count, fields);

@ -22,6 +22,7 @@
package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
@ -35,10 +36,14 @@ import net.yacy.search.schema.CollectionSchema;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
@ -49,9 +54,12 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.SearchHandler;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.request.UnInvertedField;
import org.apache.solr.response.ResultContext;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSet;
@ -180,7 +188,81 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
// return result
return rsp;
}
/**
 * Conversion from a SolrQueryResponse (which is a solr-internal data format) to a SolrDocumentList (which is a solrj-format).
 * Inside the solrj api this conversion is done using the BinaryResponseWriter and a very complex unfolding process
 * via org.apache.solr.common.util.JavaBinCodec.marshal; this method reads the documents directly from the searcher instead.
 * A document that cannot be read (IOException) is logged and left out of the returned list.
 * @param req the request belonging to the response; its searcher is used to load the documents
 * @param rsp the response; the result context is taken from its "response" value
 * @return a list carrying the match count, the start offset and the converted documents
 */
public SolrDocumentList SolrQueryResponse2SolrDocumentList(final SolrQueryRequest req, final SolrQueryResponse rsp) {
    final SolrDocumentList result = new SolrDocumentList();
    @SuppressWarnings("rawtypes")
    final NamedList values = rsp.getValues();
    final ResultContext context = (ResultContext) values.get("response");
    // without a result context an empty slice stands in for the hits
    final DocList hits = context == null ? new DocSlice(0, 0, new int[0], new float[0], 0, 0.0f) : context.docs;
    if (hits == null) {
        result.setNumFound(0);
        result.setStart(0);
        return result;
    }
    result.setNumFound(hits.matches());
    result.setStart(hits.offset());
    int remaining = hits.size();
    final SolrIndexSearcher searcher = req.getSearcher();
    final DocIterator docIds = hits.iterator();
    while (remaining-- > 0) {
        try {
            result.add(doc2SolrDoc(searcher.doc(docIds.nextDoc(), (Set<String>) null)));
        } catch (IOException e) {
            ConcurrentLog.logException(e);
        }
    }
    return result;
}
/**
 * Builds a solrj SolrDocument from a lucene Document.
 * Every field is looked up in the latest schema of the core. If a field type is found,
 * the value is produced by that type; otherwise the binary value of the field is used
 * (copied if it is only a part of the backing array) or, if there is none, the string value.
 * A field whose conversion throws is left out of the result.
 * @param doc the lucene document
 * @return the converted document
 */
public SolrDocument doc2SolrDoc(Document doc) {
    final SolrDocument converted = new SolrDocument();
    for (final IndexableField field : doc) {
        final String name = field.name();
        final SchemaField schemaField = this.core.getLatestSchema().getFieldOrNull(name);
        Object value = null;
        try {
            final FieldType type = schemaField == null ? null : schemaField.getType();
            if (type != null) {
                value = type.toObject(field);
            } else {
                final BytesRef ref = field.binaryValue();
                if (ref == null) {
                    value = field.stringValue();
                } else if (ref.offset == 0 && ref.length == ref.bytes.length) {
                    // the reference covers the whole array, no copy needed
                    value = ref.bytes;
                } else {
                    final byte[] copy = new byte[ref.length];
                    System.arraycopy(ref.bytes, ref.offset, copy, 0, ref.length);
                    value = copy;
                }
            }
        } catch (Throwable e) {
            continue;
        }
        // the first value of a multi-valued field is wrapped into a list
        final boolean firstOfMultiValued = schemaField != null && schemaField.multiValued() && !converted.containsKey(name);
        if (firstOfMultiValued) {
            final ArrayList<Object> list = new ArrayList<Object>();
            list.add(value);
            converted.addField(name, list);
        } else {
            converted.addField(name, value);
        }
    }
    return converted;
}
/**
* the usage of getResponseByParams is discouraged for the embedded Solr connector. Please use request(SolrParams) instead.
* Reason: Solr makes a very complex folding/unfolding including data compression for SolrQueryResponses.
@ -196,7 +278,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
try {
rsp = this.server.query(params);
if (q != null) Thread.currentThread().setName(threadname);
if (rsp != null) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
return rsp;
} catch (final SolrServerException e) {
throw new IOException(e);
@ -205,6 +287,44 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
}
}
/**
 * Get the solr document list for the given query parameters.
 * In contrast to getResponseByParams only the fields of the result documents are created;
 * search snippets and facets are never generated.
 * The request is closed and the request info is cleared in any case, also if the query fails.
 * @param params the solr query parameters
 * @return the document list converted from the solr-internal response
 * @throws IOException if the query produced no response
 * @throws SolrException
 */
@Override
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
    final SolrQueryRequest solrRequest = this.request(params);
    try {
        final SolrQueryResponse solrResponse = this.query(solrRequest);
        if (solrResponse == null) throw new IOException("response == null");
        return SolrQueryResponse2SolrDocumentList(solrRequest, solrResponse);
    } finally {
        solrRequest.close();
        SolrRequestInfo.clearRequestInfo();
    }
}
/**
 * Get the number of results for the given query parameters.
 * The count is read from the result context of the solr-internal response, no documents are converted.
 * A missing result context or a result context without a document list counts as zero matches.
 * The request is closed and the request info is cleared in any case, also if the query fails.
 * @param params the solr query parameters
 * @return the number of matching documents
 * @throws IOException if the query produced no response
 * @throws SolrException
 */
@Override
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException {
    final SolrQueryRequest req = this.request(params);
    try {
        final SolrQueryResponse response = this.query(req);
        if (response == null) throw new IOException("response == null");
        final NamedList<?> values = response.getValues();
        final ResultContext resultContext = (ResultContext) values.get("response");
        // same null handling as in SolrQueryResponse2SolrDocumentList: no docs means no matches
        if (resultContext == null || resultContext.docs == null) return 0;
        return resultContext.docs.matches();
    } finally {
        req.close();
        SolrRequestInfo.clearRequestInfo();
    }
}
private class DocListSearcher {
public SolrQueryRequest request;
public DocList response;

@ -275,6 +275,49 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
// TODO: combine both
return rsp1;
}
/**
 * Get the solr document list for the given query parameters from the two attached connectors.
 * If only one of solr0/solr1 is set, the query is answered by that connector alone;
 * if none is set, an empty list is returned.
 * If both are set, solr0 is asked first and its list is returned if it holds at least ROWS documents.
 * Otherwise solr1 is asked for the remaining rows and ONLY the solr1 list is returned;
 * the documents already retrieved from solr0 are not merged into the result (see TODO below).
 * The given query object is modified (START/ROWS) during the computation and set back to the
 * original values afterwards; they are not set back if one of the queries throws.
 * @param query the solr query parameters; ROWS defaults to 10 and START to 0 if not given
 * @return the document list as described above
 * @throws IOException
 * @throws SolrException
 */
@Override
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams query) throws IOException, SolrException {
// read the requested window; these values are used below to restore the query
Integer count0 = query.getInt(CommonParams.ROWS);
int count = count0 == null ? 10 : count0.intValue();
Integer start0 = query.getInt(CommonParams.START);
int start = start0 == null ? 0 : start0.intValue();
if (this.solr0 == null && this.solr1 == null) return new SolrDocumentList();
if (this.solr0 != null && this.solr1 == null) {
SolrDocumentList list = this.solr0.getDocumentListByParams(query);
return list;
}
if (this.solr1 != null && this.solr0 == null) {
SolrDocumentList list = this.solr1.getDocumentListByParams(query);
return list;
}
// combine both lists
// NOTE(review): a null list from solr0 would cause a NullPointerException here - confirm that connectors never return null
final SolrDocumentList l = this.solr0.getDocumentListByParams(query);
if (l.size() >= count) return l;
// at this point we need to know how many results are in solr0
// compute this with a very bad hack; replace with better method later
int size0 = 0;
{ //bad hack - TODO: replace
// the query is repeated without a row limit, only to count the returned documents
// NOTE(review): l.getNumFound() presumably carries the same number without a second query - verify
query.set(CommonParams.START, 0);
query.set(CommonParams.ROWS, Integer.MAX_VALUE);
final SolrDocumentList lHack = this.solr0.getDocumentListByParams(query);
query.set(CommonParams.START, start);
query.set(CommonParams.ROWS, count);
size0 = lHack.size();
}
// now use the size of the first query to do a second query
// the offset for solr1 is the requested start, shifted by what solr0 delivered and by the solr0 size
query.set(CommonParams.START, start + l.size() - size0);
query.set(CommonParams.ROWS, count - l.size());
final SolrDocumentList l1 = this.solr1.getDocumentListByParams(query);
// restore the caller's window
query.set(CommonParams.START, start);
query.set(CommonParams.ROWS, count);
// TODO: combine both
// until then the documents in l are dropped and only the solr1 result is returned
return l1;
}
@Override
public long getCountByQuery(final String querystring) throws IOException {

@ -98,7 +98,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
public void deleteByQuery(final String querystring) throws IOException;
/**
* check if a given document, identified by url hash as ducument id exists
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return true if any entry in solr exists
* @throws IOException
@ -139,12 +139,32 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
public SolrDocument getDocumentById(final String key, final String ... fields) throws IOException;
/**
* get a query response from solr
* get a "full" query response from solr. Please compare to getDocumentListByParams which may be much more efficient
* @param query
* @throws IOException
*/
public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException, SolrException;
/**
* get the solr document list from a query response
* This differs from getResponseByParams in such a way that it does only create the fields of the response but
* never search snippets and there are also no facets generated.
* @param params
* @return
* @throws IOException
* @throws SolrException
*/
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException;
/**
* get the number of results for a query response
* @param params
* @return
* @throws IOException
* @throws SolrException
*/
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException;
/**
* get a query result from solr
* to get all results set the query String to "*:*"

@ -30,8 +30,11 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.schema.CollectionSchema;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
@ -41,6 +44,7 @@ import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.request.LukeRequest;
import org.apache.solr.client.solrj.response.LukeResponse.FieldInfo;
import org.apache.solr.client.solrj.response.LukeResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
public abstract class SolrServerConnector extends AbstractSolrConnector implements SolrConnector {
@ -285,6 +289,35 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
}
}
}
/**
 * get the solr document list from a query response
 * This differs from getResponseByParams in such a way that it does only create the fields of the response but
 * never search snippets and there are also no facets generated.
 * While the query is running, the name of the current thread is set to the query string; the
 * original name is restored when the method is left, also if the query fails.
 * @param params the solr query parameters
 * @return the result list of the query response; null if the response carries no result list
 * @throws IOException if the server is disconnected or the query fails for any reason other than a SolrServerException
 * @throws SolrException if the server reports a SolrServerException
 */
@Override
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
    if (this.server == null) throw new IOException("server disconnected");
    // during the solr query we set the thread name to the query string to get more debugging info in thread dumps
    final String q = params.get("q");
    final Thread thread = Thread.currentThread();
    final String threadname = thread.getName();
    if (q != null) thread.setName("solr query: q = " + q);
    try {
        final QueryResponse rsp = this.server.query(params);
        final SolrDocumentList results = rsp.getResults();
        // logging must not decide about success: only touch the list if there is one
        if (results != null && log.isFine()) log.fine(results.getNumFound() + " results for q=" + q);
        return results;
    } catch (final SolrServerException e) {
        throw new SolrException(ErrorCode.UNKNOWN, e);
    } catch (final Throwable e) {
        throw new IOException("Error executing query", e);
    } finally {
        // restore the name on every exit path, otherwise a failed query leaves its name on a pooled thread
        if (q != null) thread.setName(threadname);
    }
}
public Collection<FieldInfo> getFields() throws SolrServerException {
// get all fields contained in index

@ -30,10 +30,10 @@ import java.util.Set;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
@ -50,12 +50,12 @@ public class ErrorCache {
private static final int maxStackSize = 1000;
// the class object
private final Map<String, CollectionConfiguration.FailDoc> stack;
private final Map<String, CollectionConfiguration.FailDoc> cache;
private final Fulltext fulltext;
public ErrorCache(final Fulltext fulltext) {
this.fulltext = fulltext;
this.stack = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
this.cache = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
try {
// fill stack with latest values
final SolrQuery params = new SolrQuery();
@ -64,28 +64,29 @@ public class ErrorCache {
params.setRows(100);
params.setFacet(false);
params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
params.setFacet(false);
params.setFields(CollectionSchema.id.getSolrFieldName());
params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
QueryResponse rsp = fulltext.getDefaultConnector().getResponseByParams(params);
SolrDocumentList docList = rsp == null ? null : rsp.getResults();
params.set(CommonParams.DF, CollectionSchema.id.getSolrFieldName()); // DisMaxParams.QF or CommonParams.DF must be given
SolrDocumentList docList = fulltext.getDefaultConnector().getDocumentListByParams(params);
if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) {
CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(docList.get(i));
this.stack.put(ASCII.String(failDoc.getDigestURL().hash()), failDoc);
SolrDocument doc = docList.get(i);
String hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
this.cache.put(hash, null);
}
} catch (final Throwable e) {
}
}
public void clear() throws IOException {
if (this.stack != null) synchronized (this.stack) {this.stack.clear();}
if (this.cache != null) synchronized (this.cache) {this.cache.clear();}
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
}
public void removeHosts(final Set<String> hosthashes) {
if (hosthashes == null || hosthashes.size() == 0) return;
this.fulltext.deleteDomainErrors(hosthashes);
synchronized (this.stack) {
Iterator<String> i = ErrorCache.this.stack.keySet().iterator();
synchronized (this.cache) {
Iterator<String> i = ErrorCache.this.cache.keySet().iterator();
while (i.hasNext()) {
String b = i.next();
if (hosthashes.contains(b)) i.remove();
@ -105,9 +106,6 @@ public class ErrorCache {
url, profile == null ? null : profile.collections(),
failCategory.name() + " " + reason, failCategory.failType,
httpcode);
synchronized (this.stack) {
this.stack.put(ASCII.String(url.hash()), failDoc);
}
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
// send the error to solr
try {
@ -116,38 +114,57 @@ public class ErrorCache {
} catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
}
synchronized (this.cache) {
this.cache.put(ASCII.String(url.hash()), null);
}
} else {
synchronized (this.cache) {
this.cache.put(ASCII.String(url.hash()), failDoc);
}
}
checkStackSize();
}
private void checkStackSize() {
synchronized (this.stack) {
int dc = this.stack.size() - maxStackSize;
synchronized (this.cache) {
int dc = this.cache.size() - maxStackSize;
if (dc > 0) {
Collection<String> d = new ArrayList<String>();
Iterator<String> i = this.stack.keySet().iterator();
Iterator<String> i = this.cache.keySet().iterator();
while (dc-- > 0 && i.hasNext()) d.add(i.next());
for (String s: d) this.stack.remove(s);
for (String s: d) this.cache.remove(s);
}
}
}
public ArrayList<CollectionConfiguration.FailDoc> list(int max) {
final ArrayList<CollectionConfiguration.FailDoc> l = new ArrayList<CollectionConfiguration.FailDoc>();
synchronized (this.stack) {
Iterator<CollectionConfiguration.FailDoc> fdi = this.stack.values().iterator();
for (int i = 0; i < this.stack.size() - max; i++) fdi.next();
while (fdi.hasNext()) l.add(fdi.next());
synchronized (this.cache) {
Iterator<Map.Entry<String, CollectionConfiguration.FailDoc>> hi = this.cache.entrySet().iterator();
for (int i = 0; i < this.cache.size() - max; i++) hi.next();
while (hi.hasNext()) {
try {
Map.Entry<String, CollectionConfiguration.FailDoc> entry = hi.next();
String hash = entry.getKey();
CollectionConfiguration.FailDoc failDoc = entry.getValue();
if (failDoc == null) {
SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(hash);
if (doc != null) failDoc = new CollectionConfiguration.FailDoc(doc);
}
if (failDoc != null) l.add(failDoc);
} catch (IOException e) {
}
}
}
return l;
}
public CollectionConfiguration.FailDoc get(final String urlhash) {
CollectionConfiguration.FailDoc fd;
synchronized (this.stack) {
fd = this.stack.get(urlhash);
CollectionConfiguration.FailDoc failDoc = null;
synchronized (this.cache) {
failDoc = this.cache.get(urlhash);
}
if (fd != null) return fd;
if (failDoc != null) return failDoc;
try {
SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
if (doc == null) return null;
@ -171,16 +188,17 @@ public class ErrorCache {
}
public void clearStack() {
synchronized (this.stack) {
this.stack.clear();
synchronized (this.cache) {
this.cache.clear();
}
}
public int stackSize() {
synchronized (this.stack) {
return this.stack.size();
synchronized (this.cache) {
return this.cache.size();
}
}
}

@ -1365,7 +1365,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
this.collections = new HashMap<String, Pattern>();
Collection<Object> c = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName());
for (Object cn: c) this.collections.put((String) cn, QueryParams.catchall_pattern);
if (c != null) for (Object cn: c) if (cn != null) this.collections.put((String) cn, QueryParams.catchall_pattern);
this.failReason = (String) doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
this.failType = FailType.valueOf((String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()));
this.httpstatus = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName());

Loading…
Cancel
Save