From 0ceeceb35e0b9a985b2094e6fd8c3f71e259cbb0 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 4 Aug 2014 02:35:38 +0200 Subject: [PATCH] more logic on Solr queries; usage of the query terms in posprocessing, saving one query for double document detection now per document --- .../solr/logic/AbstractOperations.java | 97 +++++++++++++++++++ .../federate/solr/logic/AbstractTerm.java | 44 +++++++++ .../cora/federate/solr/logic/Conjunction.java | 69 +++++-------- .../yacy/cora/federate/solr/logic/DNF.java | 59 ----------- .../cora/federate/solr/logic/Disjunction.java | 65 +++++++++++++ .../cora/federate/solr/logic/Literal.java | 44 +++++++-- .../cora/federate/solr/logic/Negation.java | 89 +++++++++++++++++ .../cora/federate/solr/logic/Operations.java | 58 +++++++++++ .../yacy/cora/federate/solr/logic/Term.java | 76 +++++++++++++++ .../cora/federate/solr/logic/TermTools.java | 58 +++++++++++ .../net/yacy/kelondro/util/MemoryControl.java | 3 +- .../schema/CollectionConfiguration.java | 65 ++++++++----- 12 files changed, 592 insertions(+), 135 deletions(-) create mode 100644 source/net/yacy/cora/federate/solr/logic/AbstractOperations.java create mode 100644 source/net/yacy/cora/federate/solr/logic/AbstractTerm.java delete mode 100644 source/net/yacy/cora/federate/solr/logic/DNF.java create mode 100644 source/net/yacy/cora/federate/solr/logic/Disjunction.java create mode 100644 source/net/yacy/cora/federate/solr/logic/Negation.java create mode 100644 source/net/yacy/cora/federate/solr/logic/Operations.java create mode 100644 source/net/yacy/cora/federate/solr/logic/Term.java create mode 100644 source/net/yacy/cora/federate/solr/logic/TermTools.java diff --git a/source/net/yacy/cora/federate/solr/logic/AbstractOperations.java b/source/net/yacy/cora/federate/solr/logic/AbstractOperations.java new file mode 100644 index 000000000..7cdaa5ce7 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/AbstractOperations.java @@ -0,0 +1,97 @@ +package 
net.yacy.cora.federate.solr.logic; + +import java.util.ArrayList; +import java.util.List; + +public abstract class AbstractOperations extends AbstractTerm implements Operations { + + protected final String operandName; + protected final List terms; + + public AbstractOperations(final String operandName) { + this.operandName = operandName; + this.terms = new ArrayList<>(); + } + + @Override + public int hashCode() { + int h = operandName.hashCode(); + for (Term t: this.terms) h += t.hashCode(); + return h; + } + + @Override + public void addOperand(Term term) { + this.terms.add(term); + } + + /** + * As a Operations object is a collection of Terms, we must be able to show them + * @return the list of terms + */ + @Override + public List getOperands() { + return this.terms; + } + + /** + * the weight attribute of a term shows if rewritten terms + * (using rules of replacement as allowed for propositional logic) + * are shorter and therefore more efficient. + * @return the number of operators plus the number of operands plus one + */ + @Override + public int weight() { + return terms.size() * 2; + } + + @Override + public boolean isBinary() { + return this.terms.size() == 2; + } + + /** + * can we set brackets anywhere (means: can we change calculation order)? 
+ */ + @Override + public boolean isAssociative() { + return true; + } + + /** + * can we switch operands (must be binary) + */ + @Override + public boolean isCommutative() { + return isBinary(); + } + + /** + * can we 'multiply inside' (must be binary) + */ + @Override + public boolean isDistributive() { + return isBinary(); + } + + @Override + public Term lightestRewrite() { + return this; + } + + + /** + * create a Solr query string from this conjunction + * @return a string which is a Solr query string + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for (Term term: this.terms) { + if (sb.length() == 0) sb.append('('); else sb.append(") ").append(this.operandName).append(" ("); + sb.append(term.toString()); + } + sb.append(')'); + return sb.toString(); + } +} diff --git a/source/net/yacy/cora/federate/solr/logic/AbstractTerm.java b/source/net/yacy/cora/federate/solr/logic/AbstractTerm.java new file mode 100644 index 000000000..18bb7f3f4 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/AbstractTerm.java @@ -0,0 +1,44 @@ +/** + * AbstractTerm + * Copyright 2014 by Michael Peter Christen + * First released 03.08.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.federate.solr.logic; + +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; + +public abstract class AbstractTerm implements Term { + + /** + * create a hit subset of the given SolrDocumentList according to the conjunction defined + * in this object + * @param sdl the SolrDocumentList + * @return a manufactured subset-clone of the given SolrDocumentList where document match with the term as given in this object + */ + @Override + public SolrDocumentList apply(SolrDocumentList sdl) { + SolrDocumentList r = new SolrDocumentList(); + int numFound = 0; + for (SolrDocument d: sdl) { + if (matches(d)) {r.add(d); numFound++;} + } + r.setNumFound(numFound); + return r; + } +} diff --git a/source/net/yacy/cora/federate/solr/logic/Conjunction.java b/source/net/yacy/cora/federate/solr/logic/Conjunction.java index 9645aef69..df5286631 100644 --- a/source/net/yacy/cora/federate/solr/logic/Conjunction.java +++ b/source/net/yacy/cora/federate/solr/logic/Conjunction.java @@ -20,72 +20,47 @@ package net.yacy.cora.federate.solr.logic; -import java.util.ArrayList; -import java.util.List; - import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; /** - * A Concunction is a conjunction of atomic key/value pairs to Solr. The purpose of this class is, + * A Conjunction is a conjunction of terms to Solr. The purpose of this class is, * to provide a mechanism to reduce the calls to Solr when calling Solr several times with sets of - * key/value pairs which are all conjunctive. A combined query for a set of disjunctive conjunctions - * is provided by the DNF class. The result of a DNF class query to solr must be separated again using - * the original conjunctive terms which is represented by this class. 
The SolrDocumentList which are - * results from individual calls is then the same as a SolrDocument list which can be computed with the - * method apply() in this class on the DNF of the Solr result. + * terms which are all conjunctive. */ -public class Conjunction { - - private final List literals; +public class Conjunction extends AbstractOperations implements Operations { public Conjunction() { - this.literals = new ArrayList<>(); + super("AND"); } - - public void addLiteral(Literal literal) { - this.literals.add(literal); + + @Override + public Object clone() { + Conjunction c = new Conjunction(); + for (Term t: this.terms) c.addOperand(t); + return c; } - - /** - * create a Solr query string from this conjunction - * @return a string which is a Solr query string - */ + @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Literal l: this.literals) { - if (sb.length() > 0) sb.append(" AND "); - sb.append(l.toString()); + public boolean equals(Object otherTerm) { + if (!(otherTerm instanceof Conjunction)) return false; + Conjunction o = (Conjunction) otherTerm; + for (Term t: this.terms) { + if (!TermTools.isIn(t, o.getOperands())) return false; } - return sb.toString(); + return true; } - + /** * check if this conjunction matches with a given SolrDocument * @param doc the SolrDocument to match to - * @return true, if all literals of this conjunction match with the key/value pairs of the document + * @return true, if all literals of this conjunction match with the terms of the document */ + @Override public boolean matches(SolrDocument doc) { - for (Literal literal: this.literals) { - if (!literal.matches(doc)) return false; + for (Term term: this.terms) { + if (!term.matches(doc)) return false; } return true; } - /** - * create a hit subset of the given SolrDocumentList according to the conjunction defined - * in this object - * @param sdl the SolrDocumentList - * @return a manufactured subset-clone of the given 
SolrDocumentList where document match with the Conjunction as given in this object - */ - public SolrDocumentList apply(SolrDocumentList sdl) { - SolrDocumentList r = new SolrDocumentList(); - int numFound = 0; - for (SolrDocument d: r) { - if (matches(d)) {r.add(d); numFound++;} - } - r.setNumFound(numFound); - return r; - } } diff --git a/source/net/yacy/cora/federate/solr/logic/DNF.java b/source/net/yacy/cora/federate/solr/logic/DNF.java deleted file mode 100644 index 37b53644f..000000000 --- a/source/net/yacy/cora/federate/solr/logic/DNF.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * DNF - * Copyright 2014 by Michael Peter Christen - * First released 03.08.2014 at http://yacy.net - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.cora.federate.solr.logic; - -import java.util.ArrayList; -import java.util.List; - -/** - * This is the implementation of a disjunctive normal form, which is the disjunction of conjunctions. - * See: http://en.wikipedia.org/wiki/Disjunctive_normal_form - * We use a DNF to combine several solr queries into one if that is applicable. - * When caling Solr with a DNF, we need only one http request (if this is done with a remote Solr) - * and thus saving the network overhead for each single (conjunctive) query. 
To filter out the conjunctions - * from the bundled query result, you must apply the apply() method from the Conjunction class. - */ -public class DNF { - - private final List dnf; - - public DNF() { - this.dnf = new ArrayList<>(); - } - - public void addConjunction(Conjunction conjunction) { - this.dnf.add(conjunction); - } - - /** - * create a Solr query string from this DNF - */ - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Conjunction c: this.dnf) { - if (sb.length() > 0) sb.append(" OR "); - sb.append('(').append(c.toString()).append(')'); - } - return sb.toString(); - } - -} diff --git a/source/net/yacy/cora/federate/solr/logic/Disjunction.java b/source/net/yacy/cora/federate/solr/logic/Disjunction.java new file mode 100644 index 000000000..b74830ec1 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/Disjunction.java @@ -0,0 +1,65 @@ +/** + * Disjunction + * Copyright 2014 by Michael Peter Christen + * First released 03.08.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.federate.solr.logic; + +import org.apache.solr.common.SolrDocument; + +/** + * A Disjunction is a disjunction of terms to Solr. 
The purpose of this class is, + * to provide a mechanism to reduce the calls to Solr when calling Solr several times with sets of + * terms which are all disjunctive. + */ +public class Disjunction extends AbstractOperations implements Operations { + + public Disjunction() { + super("OR"); + } + + @Override + public Object clone() { + Disjunction c = new Disjunction(); + for (Term t: this.terms) c.addOperand(t); + return c; + } + + @Override + public boolean equals(Object otherTerm) { + if (!(otherTerm instanceof Disjunction)) return false; + Disjunction o = (Disjunction) otherTerm; + for (Term t: this.terms) { + if (!TermTools.isIn(t, o.getOperands())) return false; + } + return true; + } + + /** + * check if this disjunction matches with a given SolrDocument + * @param doc the SolrDocument to match to + * @return true, if at least one term of this disjunction matches the document + */ + @Override + public boolean matches(SolrDocument doc) { + for (Term term: this.terms) { + if (term.matches(doc)) return true; + } + return false; + } +} diff --git a/source/net/yacy/cora/federate/solr/logic/Literal.java b/source/net/yacy/cora/federate/solr/logic/Literal.java index 177031022..7cdeef944 100644 --- a/source/net/yacy/cora/federate/solr/logic/Literal.java +++ b/source/net/yacy/cora/federate/solr/logic/Literal.java @@ -23,19 +23,46 @@ package net.yacy.cora.federate.solr.logic; import org.apache.solr.common.SolrDocument; import net.yacy.cora.federate.solr.SchemaDeclaration; +import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; -public class Literal { +public class Literal extends AbstractTerm implements Term { private SchemaDeclaration key; private String value; - private boolean sign; - public Literal(final SchemaDeclaration key, final String value, final boolean sign) { + public Literal(final SchemaDeclaration key, final String value) { this.key = key; this.value = value; - this.sign = sign; } + @Override + public Object clone() { + return new 
Literal(this.key, this.value); + } + + @Override + public boolean equals(Object otherTerm) { + if (!(otherTerm instanceof Literal)) return false; + Literal o = (Literal) otherTerm; + return this.key.equals(o.key) && this.value.equals(o.value); + } + + @Override + public int hashCode() { + return key.hashCode() + value.hashCode(); + } + + /** + * the length attribute of a term shows if rewritten terms + * (using rules of replacement as allowed for propositional logic) + * are shorter and therefore more efficient. + * @return the number of operators plus the number of operands plus one + */ + @Override + public int weight() { + return 1; + } + /** * create a Solr query string from this literal * @return a string which is a Solr query string @@ -43,7 +70,6 @@ public class Literal { @Override public String toString() { StringBuilder sb = new StringBuilder(); - if (!this.sign) sb.append('-'); sb.append(this.key.getSolrFieldName()); sb.append(':').append('"').append(this.value).append('"'); return sb.toString(); @@ -55,9 +81,15 @@ public class Literal { * @return true, if the key of this literal is contained in the document and the * value equals (does not equal) with the value if this literal (if the signature is false) */ + @Override public boolean matches(SolrDocument doc) { Object v = doc.getFieldValue(this.key.getSolrFieldName()); if (v == null) return false; - return v.toString().matches(this.value) == this.sign; + return this.value.equals(AbstractSolrConnector.CATCHALL_TERM) || v.toString().matches(this.value); + } + + @Override + public Term lightestRewrite() { + return this; } } diff --git a/source/net/yacy/cora/federate/solr/logic/Negation.java b/source/net/yacy/cora/federate/solr/logic/Negation.java new file mode 100644 index 000000000..605f7c850 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/Negation.java @@ -0,0 +1,89 @@ +/** + * Negation + * Copyright 2014 by Michael Peter Christen + * First released 03.08.2014 at http://yacy.net + * + * This 
library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.federate.solr.logic; + +import org.apache.solr.common.SolrDocument; + +public class Negation extends AbstractTerm implements Term { + + private Term term; + + public Negation(final Term term) { + this.term = term; + } + + @Override + public Object clone() { + return new Negation(this.term); + } + + @Override + public boolean equals(Object otherTerm) { + if (!(otherTerm instanceof Negation)) return false; + Negation o = (Negation) otherTerm; + return this.term.equals(o.term); + } + + @Override + public int hashCode() { + return -this.term.hashCode(); + } + + /** + * the length attribute of a term shows if rewritten terms + * (using rules of replacement as allowed for propositional logic) + * are shorter and therefore more efficient. 
+ * @return the number of operators plus the number of operands plus one + */ + @Override + public int weight() { + return term.weight() + 1; + } + + /** + * create a Solr query string from this negation + * @return a string which is a Solr query string + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append('-').append(this.term.toString()); + return sb.toString(); + } + + /** + * check if the negated term does not match the SolrDocument + * @param doc the document to match to this negation + * @return true, if the term wrapped by this negation does not match the document; + * false, if the wrapped term matches the document + */ + @Override + public boolean matches(SolrDocument doc) { + return !this.term.matches(doc); + } + + @Override + public Term lightestRewrite() { + // TODO: this can be enhanced if negations are not attached to atoms + Term t = this.term.lightestRewrite(); + return new Negation(t); + } +} diff --git a/source/net/yacy/cora/federate/solr/logic/Operations.java b/source/net/yacy/cora/federate/solr/logic/Operations.java new file mode 100644 index 000000000..219b9f172 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/Operations.java @@ -0,0 +1,58 @@ +package net.yacy.cora.federate.solr.logic; + +import java.util.List; + +/** + * The Operations interface describes a set of operands which form a term using the same operation. 
+ */ +public interface Operations extends Term { + + /** + * As an Operations object is a collection of Terms, we must be able to show them + * @return the list of terms + */ + public List getOperands(); + + /** + * add another operand to the operations term + * @param operand + */ + public void addOperand(Term operand); + + /** + * the operation is binary, if it contains two operands + * @return true if this is a binary operation + */ + public boolean isBinary(); + + /** + * a binary operation * on a set S is called associative if it satisfies the associative law: + * (x * y) * z = x * (y * z) for all x,y,z in S. + * @return true if this is associative + */ + public boolean isAssociative(); + + /** + * In standard truth-functional propositional logic, commutativity refers to two valid rules of replacement. + * The rules allow one to transpose propositional variables within logical expressions in logical proofs. The rules are: + * (P OR Q) <=> (Q OR P) + * (P AND Q) <=> (Q AND P) + * @return true if this is commutative + */ + public boolean isCommutative(); + + /** + * In propositional logic, distribution refers to two valid rules of replacement. + * The rules allow one to reformulate conjunctions and disjunctions within logical proofs. + * Given a set S and two binary operators * and + on S, we say that the operation * + * is left-distributive over + if, given any elements x, y, and z of S, + * x * (y + z) = (x * y) + (x * z) + * is right-distributive over + if, given any elements x, y, and z of S: + * (y + z) * x = (y * x) + (z * x) + * is distributive over + if it is left- and right-distributive. 
+ * @return true if this is distributive; + */ + public boolean isDistributive(); + + +} diff --git a/source/net/yacy/cora/federate/solr/logic/Term.java b/source/net/yacy/cora/federate/solr/logic/Term.java new file mode 100644 index 000000000..685e0bf7f --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/Term.java @@ -0,0 +1,76 @@ +/** + * Term + * Copyright 2014 by Michael Peter Christen + * First released 03.08.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.cora.federate.solr.logic; + +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; + +public interface Term { + + /** + * Equal method which returns true if the terms are logically equal. + * It is advised to create minimum-weight variants of the terms using lightestRewrite() before comparing because + * the equals method should not apply rewrite rules. If two terms are equal, then also their minimum weight rewrite is equal. 
+ * @param otherTerm + * @return true if the interpretation (apply method) of the term is equal to the interpretation (apply method) of otherTerm on any document + */ + @Override + public boolean equals(Object otherTerm); + + /** + * the weight attribute of a term shows if rewritten terms + * (using rules of replacement as allowed for propositional logic) + * are shorter and therefore more efficient. + * @return the number of operators plus the number of operands plus one + */ + public int weight(); + + /** + * toString produces the Solr Query representation of the term + * @return the Solr Query String + */ + @Override + public String toString(); + + /** + * check if this term matches the SolrDocument + * @param doc the document to match to this term + * @return true, if this term matches with the document + */ + public boolean matches(SolrDocument doc); + + /** + * Create a hit subset of the given SolrDocumentList according to the term defined + * in this object. This is the interpretation of the term on a 'world object' (the Solr document). + * @param sdl the SolrDocumentList + * @return a manufactured subset-clone of the given SolrDocumentList where documents match with the term + */ + public SolrDocumentList apply(SolrDocumentList sdl); + + /** + * Applying a rewrite rule to the term should not change the logical expression of the term. + * The possible set of rewrites of the term is computed and the lightest rewrites of the underlying terms + * are used to compare all rewrites to each other. Then the lightest term is returned. 
+ * @return the lightest term that is logically equivalent to the given term + */ + public Term lightestRewrite(); +} diff --git a/source/net/yacy/cora/federate/solr/logic/TermTools.java b/source/net/yacy/cora/federate/solr/logic/TermTools.java new file mode 100644 index 000000000..0be80e926 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/TermTools.java @@ -0,0 +1,58 @@ +/** + * TermTools + * Copyright 2014 by Michael Peter Christen + * First released 04.08.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.cora.federate.solr.logic; + +import java.util.List; + +/** + * static methods for term comparison, term order, term weights, permutations etc. 
+ */ +public class TermTools { + + public static boolean isIn(final Term a, final List termlist) { + for (Term t: termlist) { + if (a.equals(t)) return true; + } + return false; + } + /* + public static ArrayList permutations(final Operations operations) { + List ops = operations.getOperands(); + int os = ops.size(); + ArrayList permutation = new ArrayList(); + if (ops.size() < 2) { + permutation.add(operations); + return permutation; + } + Term head = ops.get(0); + ops.remove(0); + ArrayList p1 = permutations(operations); + for (Operations pt: p1) { + // insert head into each position from pt + for (int i = 0; i < os; i++) { + + } + } + return + } + */ +} diff --git a/source/net/yacy/kelondro/util/MemoryControl.java b/source/net/yacy/kelondro/util/MemoryControl.java index 6b4735cfc..0e9cbc1ef 100644 --- a/source/net/yacy/kelondro/util/MemoryControl.java +++ b/source/net/yacy/kelondro/util/MemoryControl.java @@ -92,7 +92,8 @@ public class MemoryControl { * @return bytes */ public static final long available() { - return getStrategy().available(); + long available = getStrategy().available(); + return available; } /** diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 56587f024..3cf4ba1cd 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -61,8 +61,9 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL; import net.yacy.cora.federate.solr.logic.Conjunction; -import net.yacy.cora.federate.solr.logic.DNF; +import net.yacy.cora.federate.solr.logic.Disjunction; import net.yacy.cora.federate.solr.logic.Literal; +import net.yacy.cora.federate.solr.logic.Negation; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; import 
net.yacy.cora.protocol.HeaderFramework; @@ -1374,7 +1375,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // FIND OUT IF THIS IS A DOUBLE DOCUMENT String urlhash = ASCII.String(url.hash()); String hostid = url.hosthash(); - DNF dnf = new DNF(); + Conjunction con = new Conjunction(); + con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash))); + con.addOperand(new Literal(CollectionSchema.host_id_s, hostid)); + Disjunction dnf = new Disjunction(); uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i}, {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) { @@ -1386,25 +1390,42 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // lookup the document with the same signature Long signature = (Long) sid.getField(signaturefield.getSolrFieldName()).getValue(); if (signature == null) continue uniquecheck; - Conjunction con = new Conjunction(); - con.addLiteral(new Literal(CollectionSchema.id, urlhash, false)); - con.addLiteral(new Literal(CollectionSchema.host_id_s, hostid, true)); - con.addLiteral(new Literal(signaturefield, signature.toString(), true)); - dnf.addConjunction(con); - String query = con.toString(); - try { - //SolrDocumentList docsOld = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 2000, CollectionSchema.id.getSolrFieldName()); - SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(query, null, 0, 2000, CollectionSchema.id.getSolrFieldName()); - if (docs.getNumFound() == 
0) { - sid.setField(uniquefield.getSolrFieldName(), true); - sid.setField(countfield.getSolrFieldName(), 1); - } else { - boolean firstappearance = true; - for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;} - sid.setField(uniquefield.getSolrFieldName(), firstappearance); - sid.setField(countfield.getSolrFieldName(), docs.getNumFound() + 1); // the current url was excluded from search but is included in count - } - } catch (final IOException e) {} + //con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash))); + //con.addOperand(new Literal(CollectionSchema.host_id_s, hostid)); + dnf.addOperand(new Literal(signaturefield, signature.toString())); + } + } + con.addOperand(dnf); + String query = con.toString(); + SolrDocumentList docsAkk; + try { + docsAkk = segment.fulltext().getDefaultConnector().getDocumentListByQuery(query, null, 0, 2000, + CollectionSchema.id.getSolrFieldName(), CollectionSchema.exact_signature_l.getSolrFieldName(), CollectionSchema.fuzzy_signature_l.getSolrFieldName()); + } catch (final IOException e) { + ConcurrentLog.logException(e); + docsAkk = new SolrDocumentList(); + } + uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ + {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i}, + {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) { + CollectionSchema signaturefield = checkfields[0]; + CollectionSchema uniquefield = checkfields[1]; + CollectionSchema countfield = checkfields[2]; + + if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) { + // lookup the document with the same signature + Long signature = (Long) sid.getField(signaturefield.getSolrFieldName()).getValue(); + if (signature == null) continue uniquecheck; + 
SolrDocumentList docs = new Literal(signaturefield, signature.toString()).apply(docsAkk); + if (docs.getNumFound() == 0) { + sid.setField(uniquefield.getSolrFieldName(), true); + sid.setField(countfield.getSolrFieldName(), 1); + } else { + boolean firstappearance = true; + for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;} + sid.setField(uniquefield.getSolrFieldName(), firstappearance); + sid.setField(countfield.getSolrFieldName(), docs.getNumFound() + 1); // the current url was excluded from search but is included in count + } } } @@ -1440,7 +1461,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri "-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 "-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 "-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 - "((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":[* TO *]) OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " + + "((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":" + AbstractSolrConnector.CATCHALL_TERM + ") OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " + "-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"";