added new classes which shall reduce call overhead to Solr (stub)

pull/1/head
orbiter 11 years ago
parent 3491ab4c38
commit 4099296b45

@ -0,0 +1,91 @@
/**
* Conjunction
* Copyright 2014 by Michael Peter Christen
* First released 03.08.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.logic;
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
/**
* A Conjunction is a conjunction of atomic key/value pairs to Solr. The purpose of this class is
* to provide a mechanism to reduce the calls to Solr when calling Solr several times with sets of
* key/value pairs which are all conjunctive. A combined query for a set of disjunctive conjunctions
* is provided by the DNF class. The result of a DNF class query to solr must be separated again using
* the original conjunctive terms which is represented by this class. The SolrDocumentList which are
* results from individual calls is then the same as a SolrDocument list which can be computed with the
* method apply() in this class on the DNF of the Solr result.
*/
public class Conjunction {

    /** the literals which must all be satisfied for this conjunction to match */
    private final List<Literal> literals;

    /**
     * create an empty conjunction; terms are added with addLiteral()
     */
    public Conjunction() {
        this.literals = new ArrayList<>();
    }

    /**
     * append a literal to this conjunction
     * @param literal the literal to be AND-combined with the literals added so far
     */
    public void addLiteral(Literal literal) {
        this.literals.add(literal);
    }

    /**
     * create a Solr query string from this conjunction
     * @return a string which is a Solr query string: the string forms of all literals
     *         joined with " AND "; the empty string if no literal was added
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (Literal l: this.literals) {
            if (sb.length() > 0) sb.append(" AND ");
            sb.append(l.toString());
        }
        return sb.toString();
    }

    /**
     * check if this conjunction matches with a given SolrDocument
     * @param doc the SolrDocument to match to
     * @return true, if all literals of this conjunction match with the key/value pairs of the document;
     *         a conjunction without any literal matches every document
     */
    public boolean matches(SolrDocument doc) {
        for (Literal literal: this.literals) {
            if (!literal.matches(doc)) return false;
        }
        return true;
    }

    /**
     * create a hit subset of the given SolrDocumentList according to the conjunction defined
     * in this object
     * @param sdl the SolrDocumentList
     * @return a new SolrDocumentList which holds those documents of sdl (the same document
     *         instances, in their original order) that match this conjunction; its numFound
     *         value is set to the number of matching documents
     */
    public SolrDocumentList apply(SolrDocumentList sdl) {
        SolrDocumentList r = new SolrDocumentList();
        int numFound = 0;
        // iterate over the input list; iterating over the (still empty) result list
        // would never visit any document and always produce an empty result
        for (SolrDocument d: sdl) {
            if (matches(d)) {
                r.add(d);
                numFound++;
            }
        }
        r.setNumFound(numFound);
        return r;
    }
}

@ -0,0 +1,59 @@
/**
* DNF
* Copyright 2014 by Michael Peter Christen
* First released 03.08.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.logic;
import java.util.ArrayList;
import java.util.List;
/**
* This is the implementation of a disjunctive normal form, which is the disjunction of conjunctions.
* See: http://en.wikipedia.org/wiki/Disjunctive_normal_form
* We use a DNF to combine several solr queries into one if that is applicable.
* When calling Solr with a DNF, we need only one HTTP request (if this is done with a remote Solr)
* and thus save the network overhead for each single (conjunctive) query. To filter out the conjunctions
* from the bundled query result, you must apply the apply() method from the Conjunction class.
*/
public class DNF {

    /** the conjunctions which are OR-combined by this normal form, in insertion order */
    private final List<Conjunction> conjunctions;

    /**
     * create an empty DNF; terms are added with addConjunction()
     */
    public DNF() {
        this.conjunctions = new ArrayList<>();
    }

    /**
     * append a conjunction to this DNF
     * @param conjunction the conjunction to be OR-combined with the conjunctions added so far
     */
    public void addConjunction(Conjunction conjunction) {
        this.conjunctions.add(conjunction);
    }

    /**
     * create a Solr query string from this DNF
     * @return the string forms of all conjunctions, each enclosed in parentheses and
     *         joined with " OR "; the empty string if no conjunction was added
     */
    @Override
    public String toString() {
        final StringBuilder query = new StringBuilder();
        final int count = this.conjunctions.size();
        for (int i = 0; i < count; i++) {
            // every element after the first one is preceded by the disjunction operator
            if (i > 0) {
                query.append(" OR ");
            }
            final String term = this.conjunctions.get(i).toString();
            query.append('(');
            query.append(term);
            query.append(')');
        }
        return query.toString();
    }
}

@ -0,0 +1,63 @@
/**
* Literal
* Copyright 2014 by Michael Peter Christen
* First released 03.08.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.logic;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.federate.solr.SchemaDeclaration;
/**
 * A Literal is a single, optionally negated key/value pair. It can be rendered as
 * a Solr query term and it can be evaluated against a SolrDocument.
 */
public class Literal {

    /** the schema field which is tested */
    private final SchemaDeclaration key;

    /** the value the field is compared with */
    private final String value;

    /** true for a positive literal, false for a negated literal */
    private final boolean sign;

    /**
     * create a literal
     * @param key the schema field to be tested
     * @param value the value which the field must (not) have
     * @param sign true if the field must have the value, false if it must not have the value
     */
    public Literal(final SchemaDeclaration key, final String value, final boolean sign) {
        this.key = key;
        this.value = value;
        this.sign = sign;
    }

    /**
     * create a Solr query string from this literal
     * @return a string which is a Solr query string of the form field:"value",
     *         prefixed with '-' if this literal is negated
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        if (!this.sign) sb.append('-');
        sb.append(this.key.getSolrFieldName());
        // the value is a quoted phrase; quotes and backslashes inside the value must be
        // escaped, otherwise they would terminate the phrase and corrupt the query
        sb.append(':').append('"').append(escapePhrase(this.value)).append('"');
        return sb.toString();
    }

    /**
     * check if the key/value pair of this literal occurs in the SolrDocument
     * @param doc the document to match to this literal
     * @return true, if the key of this literal is contained in the document and the string form
     *         of its value equals the value of this literal (or does not equal it, if the sign
     *         is false); false if the document does not contain the key
     */
    public boolean matches(SolrDocument doc) {
        Object v = doc.getFieldValue(this.key.getSolrFieldName());
        if (v == null) return false;
        // compare literally: String.matches() would interpret the value as a regular
        // expression, which disagrees with the exact phrase produced by toString()
        return v.toString().equals(this.value) == this.sign;
    }

    /**
     * escape the characters which are special inside a quoted Solr phrase
     * @param s the raw value, may be null
     * @return s with every backslash and double quote preceded by a backslash; s itself if it
     *         is null or contains none of these characters
     */
    private static String escapePhrase(final String s) {
        if (s == null || (s.indexOf('"') < 0 && s.indexOf('\\') < 0)) return s;
        final StringBuilder sb = new StringBuilder(s.length() + 8);
        for (int i = 0; i < s.length(); i++) {
            final char c = s.charAt(i);
            if (c == '"' || c == '\\') sb.append('\\');
            sb.append(c);
        }
        return sb.toString();
    }
}

@ -60,6 +60,9 @@ import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.solr.logic.Conjunction;
import net.yacy.cora.federate.solr.logic.DNF;
import net.yacy.cora.federate.solr.logic.Literal;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
@ -1371,18 +1374,27 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
String urlhash = ASCII.String(url.hash());
String hostid = url.hosthash();
DNF dnf = new DNF();
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
{CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
{CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) {
CollectionSchema signaturefield = checkfields[0];
CollectionSchema uniquefield = checkfields[1];
CollectionSchema countfield = checkfields[2];
if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) {
// lookup the document with the same signature
Long signature = (Long) sid.getField(signaturefield.getSolrFieldName()).getValue();
if (signature == null) continue uniquecheck;
Conjunction con = new Conjunction();
con.addLiteral(new Literal(CollectionSchema.id, urlhash, false));
con.addLiteral(new Literal(CollectionSchema.host_id_s, hostid, true));
con.addLiteral(new Literal(signaturefield, signature.toString(), true));
dnf.addConjunction(con);
String query = con.toString();
try {
SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName());
//SolrDocumentList docsOld = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 2000, CollectionSchema.id.getSolrFieldName());
SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(query, null, 0, 2000, CollectionSchema.id.getSolrFieldName());
if (docs.getNumFound() == 0) {
sid.setField(uniquefield.getSolrFieldName(), true);
sid.setField(countfield.getSolrFieldName(), 1);

Loading…
Cancel
Save