From 4099296b450da4753dc8ddc975b9e1fb56dcbe77 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 3 Aug 2014 22:44:22 +0200 Subject: [PATCH] added new classes which shall reduce call overhead to Solr (stub) --- .../cora/federate/solr/logic/Conjunction.java | 91 +++++++++++++++++++ .../yacy/cora/federate/solr/logic/DNF.java | 59 ++++++++++++ .../cora/federate/solr/logic/Literal.java | 63 +++++++++++++ .../schema/CollectionConfiguration.java | 14 ++- 4 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 source/net/yacy/cora/federate/solr/logic/Conjunction.java create mode 100644 source/net/yacy/cora/federate/solr/logic/DNF.java create mode 100644 source/net/yacy/cora/federate/solr/logic/Literal.java diff --git a/source/net/yacy/cora/federate/solr/logic/Conjunction.java b/source/net/yacy/cora/federate/solr/logic/Conjunction.java new file mode 100644 index 000000000..9645aef69 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/Conjunction.java @@ -0,0 +1,91 @@ +/** + * Conjunction + * Copyright 2014 by Michael Peter Christen + * First released 03.08.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */
+
+package net.yacy.cora.federate.solr.logic;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+
+/**
+ * A Conjunction is a conjunction (AND) of atomic key/value literals for a Solr query. The purpose
+ * of this class is to reduce the number of calls to Solr when several conjunctive queries must be
+ * made: such queries can be combined into one disjunctive query with the DNF class. The result of
+ * the combined query must then be separated again using the original conjunctions; the method
+ * apply() is meant to compute the documents which the individual query would have returned.
+ */
+public class Conjunction {
+
+    private final List<Literal> literals;
+
+    /** create an empty conjunction; without any literal it matches every document */
+    public Conjunction() {
+        this.literals = new ArrayList<>();
+    }
+
+    /** add a literal which must match in addition to the literals added before */
+    public void addLiteral(Literal literal) {
+        this.literals.add(literal);
+    }
+
+    /**
+     * create a Solr query string from this conjunction
+     * @return the query strings of all literals, joined with " AND "; empty if there is no literal
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        for (Literal l: this.literals) {
+            if (sb.length() > 0) sb.append(" AND ");
+            sb.append(l.toString());
+        }
+        return sb.toString();
+    }
+
+    /**
+     * check if this conjunction matches with a given SolrDocument
+     * @param doc the SolrDocument to match to
+     * @return true, if all literals of this conjunction match with the key/value pairs of the document
+     */
+    public boolean matches(SolrDocument doc) {
+        for (Literal literal: this.literals) {
+            if (!literal.matches(doc)) return false;
+        }
+        return true;
+    }
+
+    /**
+     * create a hit subset of the given SolrDocumentList according to the conjunction defined
+     * in this object
+     * @param sdl the SolrDocumentList, i.e. the result of a combined query; it is not modified
+     * @return a new SolrDocumentList with the documents of sdl which match this Conjunction; numFound is set to its size
+     */
+    public SolrDocumentList apply(SolrDocumentList sdl) {
+        SolrDocumentList r = new SolrDocumentList();
+        // iterate over the given list sdl; the result list r is still empty at this point
+        for (SolrDocument d: sdl) {
+            if (matches(d)) r.add(d);
+        }
+        r.setNumFound(r.size());
+        return r;
+    }
+}
diff --git a/source/net/yacy/cora/federate/solr/logic/DNF.java b/source/net/yacy/cora/federate/solr/logic/DNF.java
new file mode 100644
index 000000000..37b53644f
--- /dev/null
+++ b/source/net/yacy/cora/federate/solr/logic/DNF.java
@@ -0,0 +1,59 @@
+/**
+ * DNF
+ * Copyright 2014 by Michael Peter Christen
+ * First released 03.08.2014 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.cora.federate.solr.logic;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * This is the implementation of a disjunctive normal form, which is the disjunction of conjunctions.
+ * See: http://en.wikipedia.org/wiki/Disjunctive_normal_form
+ * We use a DNF to combine several solr queries into one if that is applicable.
+ * When calling Solr with a DNF, we need only one http request (if this is done with a remote Solr)
+ * and thus saving the network overhead for each single (conjunctive) query. 
To filter out the conjunctions
+ * from the bundled query result, you must apply the apply() method from the Conjunction class.
+ */
+public class DNF {
+
+    private final List<Conjunction> dnf;
+
+    public DNF() {
+        this.dnf = new ArrayList<>();
+    }
+
+    /** add a conjunction as a further alternative (OR-term) of this DNF */
+    public void addConjunction(Conjunction conjunction) {
+        this.dnf.add(conjunction);
+    }
+
+    /**
+     * create a Solr query string from this DNF: all conjunctions in parentheses, joined with " OR "
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        for (Conjunction c: this.dnf) {
+            if (sb.length() > 0) sb.append(" OR ");
+            sb.append('(').append(c.toString()).append(')');
+        }
+        return sb.toString();
+    }
+}
diff --git a/source/net/yacy/cora/federate/solr/logic/Literal.java b/source/net/yacy/cora/federate/solr/logic/Literal.java
new file mode 100644
index 000000000..177031022
--- /dev/null
+++ b/source/net/yacy/cora/federate/solr/logic/Literal.java
@@ -0,0 +1,63 @@
+/**
+ * Literal
+ * Copyright 2014 by Michael Peter Christen
+ * First released 03.08.2014 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.cora.federate.solr.logic;
+
+import org.apache.solr.common.SolrDocument;
+
+import net.yacy.cora.federate.solr.SchemaDeclaration;
+
+/** An atomic key/value pair of a Solr query with a sign; a literal with a false sign is negated. */
+public class Literal {
+    private final SchemaDeclaration key;
+    private final String value;
+    private final boolean sign;
+
+    /**
+     * @param key the schema field which is addressed by this literal
+     * @param value the value which the content of the field is compared with
+     * @param sign true for a positive literal, false for a negated literal
+     */
+    public Literal(final SchemaDeclaration key, final String value, final boolean sign) {
+        this.key = key;
+        this.value = value;
+        this.sign = sign;
+    }
+
+    /**
+     * create a Solr query string from this literal
+     * @return field:"value" for a positive literal and -field:"value" for a negated literal
+     */
+    @Override
+    public String toString() {
+        return (this.sign ? "" : "-") + this.key.getSolrFieldName() + ":\"" + this.value + "\"";
+    }
+
+    /**
+     * check if the key/value pair of this literal occurs in the SolrDocument (plain string comparison)
+     * @param doc the document to match to this literal
+     * @return false, if the document has no value for the key; otherwise true, if the field value equals (sign is true) or does not equal (sign is false) the value of this literal
+     */
+    public boolean matches(SolrDocument doc) {
+        Object v = doc.getFieldValue(this.key.getSolrFieldName());
+        if (v == null) return false;
+        return v.toString().equals(this.value) == this.sign; // equals, not String.matches: the value is no regex
+    }
+}
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 36c7aa77e..56587f024 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -60,6 +60,9 @@ import net.yacy.cora.federate.solr.SchemaDeclaration;
 import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
+import net.yacy.cora.federate.solr.logic.Conjunction;
+import net.yacy.cora.federate.solr.logic.DNF;
+import 
net.yacy.cora.federate.solr.logic.Literal; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; @@ -1371,18 +1374,27 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // FIND OUT IF THIS IS A DOUBLE DOCUMENT String urlhash = ASCII.String(url.hash()); String hostid = url.hosthash(); + DNF dnf = new DNF(); uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i}, {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) { CollectionSchema signaturefield = checkfields[0]; CollectionSchema uniquefield = checkfields[1]; CollectionSchema countfield = checkfields[2]; + if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) { // lookup the document with the same signature Long signature = (Long) sid.getField(signaturefield.getSolrFieldName()).getValue(); if (signature == null) continue uniquecheck; + Conjunction con = new Conjunction(); + con.addLiteral(new Literal(CollectionSchema.id, urlhash, false)); + con.addLiteral(new Literal(CollectionSchema.host_id_s, hostid, true)); + con.addLiteral(new Literal(signaturefield, signature.toString(), true)); + dnf.addConjunction(con); + String query = con.toString(); try { - SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName()); + //SolrDocumentList docsOld = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + 
":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 2000, CollectionSchema.id.getSolrFieldName()); + SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(query, null, 0, 2000, CollectionSchema.id.getSolrFieldName()); if (docs.getNumFound() == 0) { sid.setField(uniquefield.getSolrFieldName(), true); sid.setField(countfield.getSolrFieldName(), 1);