diff --git a/source/net/yacy/cora/federate/solr/logic/AbstractOperations.java b/source/net/yacy/cora/federate/solr/logic/AbstractOperations.java index 7cdaa5ce7..a1c4a8270 100644 --- a/source/net/yacy/cora/federate/solr/logic/AbstractOperations.java +++ b/source/net/yacy/cora/federate/solr/logic/AbstractOperations.java @@ -78,7 +78,6 @@ public abstract class AbstractOperations extends AbstractTerm implements Operati public Term lightestRewrite() { return this; } - /** * create a Solr query string from this conjunction @@ -86,12 +85,18 @@ public abstract class AbstractOperations extends AbstractTerm implements Operati */ @Override public String toString() { + if (this.terms.size() == 0) return ""; StringBuilder sb = new StringBuilder(); - for (Term term: this.terms) { - if (sb.length() == 0) sb.append('('); else sb.append(") ").append(this.operandName).append(" ("); - sb.append(term.toString()); + if (this.terms.size() == 1) { + sb.append(terms.iterator().next().toString()); + } else { + sb.append('('); + for (Term term: this.terms) { + if (sb.length() > 1) sb.append(' ').append(this.operandName).append(' '); + sb.append(term.toString()); + } + sb.append(')'); } - sb.append(')'); return sb.toString(); } } diff --git a/source/net/yacy/cora/federate/solr/logic/BooleanLiteral.java b/source/net/yacy/cora/federate/solr/logic/BooleanLiteral.java new file mode 100644 index 000000000..47b5f8b37 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/BooleanLiteral.java @@ -0,0 +1,80 @@ +/** + * BooleanLiteral + * Copyright 2014 by Michael Peter Christen + * First released 24.10.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.federate.solr.logic; + +import org.apache.solr.common.SolrDocument; + +import net.yacy.cora.federate.solr.SchemaDeclaration; + +public class BooleanLiteral extends Literal implements Term { + + private SchemaDeclaration key; + private boolean value; + + public BooleanLiteral(final SchemaDeclaration key, final boolean value) { + super(); + this.key = key; + this.value = value; + } + + @Override + public Object clone() { + return new BooleanLiteral(this.key, this.value); + } + + @Override + public boolean equals(Object otherTerm) { + if (!(otherTerm instanceof BooleanLiteral)) return false; + BooleanLiteral o = (BooleanLiteral) otherTerm; + return this.key.equals(o.key) && this.value == o.value; + } + + @Override + public int hashCode() { + return this.key.hashCode() + (this.value ? 1 : 0); + } + + /** + * create a Solr query string from this literal + * @return a string which is a Solr query string + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(this.key.getSolrFieldName()); + sb.append(':').append(this.value ? 
"true" : "false"); + return sb.toString(); + } + + /** + * check if the key/value pair of this literal occurs in the SolrDocument + * @param doc the document to match to this literal + * @return true, if the key of this literal is contained in the document and the + * value equals (does not equal) with the value if this literal (if the signature is false) + */ + @Override + public boolean matches(SolrDocument doc) { + Object v = doc.getFieldValue(this.key.getSolrFieldName()); + if (v == null) return false; + return v.toString().matches(this.value ? "true" : "false"); + } + +} \ No newline at end of file diff --git a/source/net/yacy/cora/federate/solr/logic/CatchallLiteral.java b/source/net/yacy/cora/federate/solr/logic/CatchallLiteral.java new file mode 100644 index 000000000..dcd414fb2 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/CatchallLiteral.java @@ -0,0 +1,79 @@ +/** + * CatchallLiteral + * Copyright 2014 by Michael Peter Christen + * First released 24.10.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.federate.solr.logic; + +import org.apache.solr.common.SolrDocument; + +import net.yacy.cora.federate.solr.SchemaDeclaration; +import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; + +public class CatchallLiteral extends Literal implements Term { + + private SchemaDeclaration key; + + public CatchallLiteral(final SchemaDeclaration key) { + super(); + this.key = key; + } + + @Override + public Object clone() { + return new CatchallLiteral(this.key); + } + + @Override + public boolean equals(Object otherTerm) { + if (!(otherTerm instanceof CatchallLiteral)) return false; + CatchallLiteral o = (CatchallLiteral) otherTerm; + return this.key.equals(o.key); + } + + @Override + public int hashCode() { + return this.key.hashCode(); + } + + /** + * create a Solr query string from this literal: the field name, a colon and the catchall term + * @return a string which is a Solr query string + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(this.key.getSolrFieldName()); + sb.append(':').append(AbstractSolrConnector.CATCHALL_TERM); + return sb.toString(); + } + + /** + * check if the key of this literal occurs in the SolrDocument; the value of the field is not inspected + * @param doc the document to match to this literal + * @return true, if the document has a value for the field of this literal, + * false if the document has no value for that field + */ + @Override + public boolean matches(SolrDocument doc) { + Object v = doc.getFieldValue(this.key.getSolrFieldName()); + if (v == null) return false; // this does not match if the field is missing + return true; + } + +} \ No newline at end of file diff --git a/source/net/yacy/cora/federate/solr/logic/Conjunction.java b/source/net/yacy/cora/federate/solr/logic/Conjunction.java index df5286631..06f09204e 100644 --- a/source/net/yacy/cora/federate/solr/logic/Conjunction.java +++ b/source/net/yacy/cora/federate/solr/logic/Conjunction.java @@ -33,6 +33,12 @@ public class 
Conjunction extends AbstractOperations implements Operations { super("AND"); } + public Conjunction(final Term t1, final Term t2) { + super("AND"); + this.addOperand(t1); + this.addOperand(t2); + } + @Override public Object clone() { Conjunction c = new Conjunction(); diff --git a/source/net/yacy/cora/federate/solr/logic/Disjunction.java b/source/net/yacy/cora/federate/solr/logic/Disjunction.java index b74830ec1..bd2c74073 100644 --- a/source/net/yacy/cora/federate/solr/logic/Disjunction.java +++ b/source/net/yacy/cora/federate/solr/logic/Disjunction.java @@ -32,6 +32,12 @@ public class Disjunction extends AbstractOperations implements Operations { public Disjunction() { super("OR"); } + + public Disjunction(final Term t1, final Term t2) { + super("OR"); + this.addOperand(t1); + this.addOperand(t2); + } @Override public Object clone() { diff --git a/source/net/yacy/cora/federate/solr/logic/Literal.java b/source/net/yacy/cora/federate/solr/logic/Literal.java index 7cdeef944..edea5a1d7 100644 --- a/source/net/yacy/cora/federate/solr/logic/Literal.java +++ b/source/net/yacy/cora/federate/solr/logic/Literal.java @@ -20,36 +20,9 @@ package net.yacy.cora.federate.solr.logic; -import org.apache.solr.common.SolrDocument; +public abstract class Literal extends AbstractTerm implements Term { -import net.yacy.cora.federate.solr.SchemaDeclaration; -import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; - -public class Literal extends AbstractTerm implements Term { - - private SchemaDeclaration key; - private String value; - - public Literal(final SchemaDeclaration key, final String value) { - this.key = key; - this.value = value; - } - - @Override - public Object clone() { - return new Literal(this.key, this.value); - } - - @Override - public boolean equals(Object otherTerm) { - if (!(otherTerm instanceof Literal)) return false; - Literal o = (Literal) otherTerm; - return this.key.equals(o.key) && this.value.equals(o.value); - } - - @Override - public int 
hashCode() { - return key.hashCode() + value.hashCode(); + public Literal() { } /** @@ -63,31 +36,6 @@ public class Literal extends AbstractTerm implements Term { return 1; } - /** - * create a Solr query string from this literal - * @return a string which is a Solr query string - */ - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(this.key.getSolrFieldName()); - sb.append(':').append('"').append(this.value).append('"'); - return sb.toString(); - } - - /** - * check if the key/value pair of this literal occurs in the SolrDocument - * @param doc the document to match to this literal - * @return true, if the key of this literal is contained in the document and the - * value equals (does not equal) with the value if this literal (if the signature is false) - */ - @Override - public boolean matches(SolrDocument doc) { - Object v = doc.getFieldValue(this.key.getSolrFieldName()); - if (v == null) return false; - return this.value.equals(AbstractSolrConnector.CATCHALL_TERM) || v.toString().matches(this.value); - } - @Override public Term lightestRewrite() { return this; diff --git a/source/net/yacy/cora/federate/solr/logic/LongLiteral.java b/source/net/yacy/cora/federate/solr/logic/LongLiteral.java new file mode 100644 index 000000000..39255248a --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/LongLiteral.java @@ -0,0 +1,80 @@ +/** + * LongLiteral + * Copyright 2014 by Michael Peter Christen + * First released 24.10.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.federate.solr.logic; + +import org.apache.solr.common.SolrDocument; + +import net.yacy.cora.federate.solr.SchemaDeclaration; + +public class LongLiteral extends Literal implements Term { + + private final SchemaDeclaration key; + private final long value; + + public LongLiteral(final SchemaDeclaration key, final long value) { + super(); + this.key = key; + this.value = value; + } + + @Override + public Object clone() { + return new LongLiteral(this.key, this.value); + } + + @Override + public boolean equals(Object otherTerm) { + if (!(otherTerm instanceof LongLiteral)) return false; + LongLiteral o = (LongLiteral) otherTerm; + return this.key.equals(o.key) && this.value == o.value; + } + + @Override + public int hashCode() { + return this.key.hashCode() + (int) (this.value & Integer.MAX_VALUE); + } + + /** + * create a Solr query string from this literal + * @return a string of the form fieldname:value which is a Solr query string + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(this.key.getSolrFieldName()); + sb.append(':').append(this.value); + return sb.toString(); + } + + /** + * check if the key/value pair of this literal occurs in the SolrDocument + * @param doc the document to match to this literal + * @return true, if the key of this literal is contained in the document and the string form + * of the document value equals the decimal representation of the value of this literal + */ + @Override + public boolean matches(SolrDocument doc) { + Object v = doc.getFieldValue(this.key.getSolrFieldName()); + if (v == null) return false; + return v.toString().equals(Long.toString(this.value)); // plain comparison; a decimal number needs no regular expression + } + +} \ No newline at end of file diff --git a/source/net/yacy/cora/federate/solr/logic/StringLiteral.java 
b/source/net/yacy/cora/federate/solr/logic/StringLiteral.java new file mode 100644 index 000000000..fc00137df --- /dev/null +++ b/source/net/yacy/cora/federate/solr/logic/StringLiteral.java @@ -0,0 +1,81 @@ +/** + * StringLiteral + * Copyright 2014 by Michael Peter Christen + * First released 03.08.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.federate.solr.logic; + +import org.apache.solr.common.SolrDocument; + +import net.yacy.cora.federate.solr.SchemaDeclaration; +import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; + +public class StringLiteral extends Literal implements Term { + + private SchemaDeclaration key; + private String value; + + public StringLiteral(final SchemaDeclaration key, final String value) { + super(); + this.key = key; + this.value = value; + } + + @Override + public Object clone() { + return new StringLiteral(this.key, this.value); + } + + @Override + public boolean equals(Object otherTerm) { + if (!(otherTerm instanceof StringLiteral)) return false; + StringLiteral o = (StringLiteral) otherTerm; + return this.key.equals(o.key) && this.value.equals(o.value); + } + + @Override + public int hashCode() { + return key.hashCode() + value.hashCode(); + } + + /** + * create a Solr query string from this literal; the value is put between double quotes as it is + * @return a string which is a Solr query string - NOTE(review): the value is not escaped here, confirm callers never pass quotes + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(this.key.getSolrFieldName()); + sb.append(':').append('"').append(this.value).append('"'); + return sb.toString(); + } + + /** + * check if the key/value pair of this literal occurs in the SolrDocument + * @param doc the document to match to this literal + * @return true, if the key of this literal is contained in the document and the value of this literal + * is the catchall term or, used as a regular expression, matches the string form of the document value + */ + @Override + public boolean matches(SolrDocument doc) { + Object v = doc.getFieldValue(this.key.getSolrFieldName()); + if (v == null) return false; + return this.value.equals(AbstractSolrConnector.CATCHALL_TERM) || v.toString().matches(this.value); + } + +} diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 2c2521310..50803695a 100644 --- 
a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -61,10 +61,13 @@ import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL; +import net.yacy.cora.federate.solr.logic.BooleanLiteral; +import net.yacy.cora.federate.solr.logic.CatchallLiteral; import net.yacy.cora.federate.solr.logic.Conjunction; import net.yacy.cora.federate.solr.logic.Disjunction; -import net.yacy.cora.federate.solr.logic.Literal; +import net.yacy.cora.federate.solr.logic.LongLiteral; import net.yacy.cora.federate.solr.logic.Negation; +import net.yacy.cora.federate.solr.logic.StringLiteral; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; @@ -1442,9 +1445,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public void postprocessing_doublecontent(Segment segment, Set uniqueURLs, SolrDocument doc, final SolrInputDocument sid, final DigestURL url) { // FIND OUT IF THIS IS A DOUBLE DOCUMENT + // term to describe documents which are indexable: + // - no noindex in meta or x-robots + // - no canonical-tag + Conjunction ValidDocTermTemplate = new Conjunction(); + ValidDocTermTemplate.addOperand(new LongLiteral(CollectionSchema.httpstatus_i, 200)); + ValidDocTermTemplate.addOperand(new Disjunction(new Negation(new CatchallLiteral(CollectionSchema.canonical_equal_sku_b)), new BooleanLiteral(CollectionSchema.canonical_equal_sku_b, true))); + ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 8))); // bit 3 (noindex) + ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 24))); // bit 3 + 4 (noindex + nofollow) + ValidDocTermTemplate.addOperand(new 
Negation(new LongLiteral(CollectionSchema.robots_i, 512))); // bit 9 (noindex) + ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 1536))); // bit 9 + 10 (noindex + nofollow) + String urlhash = ASCII.String(url.hash()); String hostid = url.hosthash(); - Conjunction con = new Conjunction(); Disjunction dnf = new Disjunction(); CollectionSchema[][] doccheckschema = new CollectionSchema[][]{ {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i}, @@ -1460,12 +1473,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (signature == null) continue uniquecheck; //con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash))); //con.addOperand(new Literal(CollectionSchema.host_id_s, hostid)); - dnf.addOperand(new Literal(signaturefield, signature.toString())); + dnf.addOperand(new LongLiteral(signaturefield, signature)); } } + Conjunction con = (Conjunction) ValidDocTermTemplate.clone(); con.addOperand(dnf); - con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash))); - con.addOperand(new Literal(CollectionSchema.host_id_s, hostid)); + con.addOperand(new Negation(new StringLiteral(CollectionSchema.id, urlhash))); + con.addOperand(new StringLiteral(CollectionSchema.host_id_s, hostid)); String query = con.toString(); SolrDocumentList docsAkk; try { @@ -1484,7 +1498,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // lookup the document with the same signature Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName()); if (signature == null) continue uniquecheck; - SolrDocumentList docs = new Literal(signaturefield, signature.toString()).apply(docsAkk); + SolrDocumentList docs = new StringLiteral(signaturefield, signature.toString()).apply(docsAkk); if (docs.getNumFound() == 0) { sid.setField(uniquefield.getSolrFieldName(), true); 
sid.setField(countfield.getSolrFieldName(), 1); @@ -1525,17 +1539,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri continue uniquecheck; } try { - String doccountquery = - CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + - "-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3 (noindex) - "-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 (noindex + nofollow) - "-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 (noindex) - "-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 (noindex + nofollow) - "((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":" + AbstractSolrConnector.CATCHALL_TERM + ") OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " + - CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " + - "-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + - signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\""; - long doccount = segment.fulltext().getDefaultConnector().getCountByQuery(doccountquery); + Conjunction doccountterm = (Conjunction) ValidDocTermTemplate.clone(); + doccountterm.addOperand(new Negation(new StringLiteral(CollectionSchema.id, urlhash))); + doccountterm.addOperand(new StringLiteral(CollectionSchema.host_id_s, hostid)); + doccountterm.addOperand(new LongLiteral(signaturefield, signature)); + long doccount = segment.fulltext().getDefaultConnector().getCountByQuery(doccountterm.toString()); sid.setField(uniquefield.getSolrFieldName(), doccount == 0); } catch (final IOException e) {} }