fix for exact_signature_unique_b, exact_signature_copycount_i,

fuzzy_signature_unique_b and fuzzy_signature_copycount_i: apply same
criteria for 'valid document' as for title and description uniqueness
test.
pull/1/head
Michael Peter Christen 10 years ago
parent 7832ba44d6
commit fe537679de

@ -78,7 +78,6 @@ public abstract class AbstractOperations extends AbstractTerm implements Operati
public Term lightestRewrite() { public Term lightestRewrite() {
return this; return this;
} }
/** /**
* create a Solr query string from this conjunction * create a Solr query string from this conjunction
@ -86,12 +85,18 @@ public abstract class AbstractOperations extends AbstractTerm implements Operati
*/ */
@Override @Override
public String toString() { public String toString() {
if (this.terms.size() == 0) return "";
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (Term term: this.terms) { if (this.terms.size() == 1) {
if (sb.length() == 0) sb.append('('); else sb.append(") ").append(this.operandName).append(" ("); sb.append(terms.iterator().next().toString());
sb.append(term.toString()); } else {
sb.append('(');
for (Term term: this.terms) {
if (sb.length() > 1) sb.append(' ').append(this.operandName).append(' ');
sb.append(term.toString());
}
sb.append(')');
} }
sb.append(')');
return sb.toString(); return sb.toString();
} }
} }

@ -0,0 +1,80 @@
/**
* BooleanLiteral
* Copyright 2014 by Michael Peter Christen
* First released 24.10.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.logic;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.federate.solr.SchemaDeclaration;
/**
 * A literal term which binds a schema field to a fixed boolean value.
 * Its query form is <code>fieldname:true</code> or <code>fieldname:false</code>.
 */
public class BooleanLiteral extends Literal implements Term {

    private final SchemaDeclaration key;
    private final boolean value;

    /**
     * @param key the schema field this literal refers to
     * @param value the boolean value the field is compared with
     */
    public BooleanLiteral(final SchemaDeclaration key, final boolean value) {
        super();
        this.key = key;
        this.value = value;
    }

    /**
     * @return a new BooleanLiteral with the same key and value
     */
    @Override
    public Object clone() {
        return new BooleanLiteral(this.key, this.value);
    }

    /**
     * Two boolean literals are equal if they have equal keys and the same value.
     */
    @Override
    public boolean equals(Object otherTerm) {
        if (otherTerm instanceof BooleanLiteral) {
            final BooleanLiteral other = (BooleanLiteral) otherTerm;
            return this.key.equals(other.key) && this.value == other.value;
        }
        return false;
    }

    @Override
    public int hashCode() {
        int hash = this.key.hashCode();
        if (this.value) hash++;
        return hash;
    }

    /**
     * create a Solr query string from this literal
     * @return the field name and the boolean value, separated by a colon
     */
    @Override
    public String toString() {
        return this.key.getSolrFieldName() + ':' + this.value;
    }

    /**
     * check if the key/value pair of this literal occurs in the SolrDocument
     * @param doc the document to match to this literal
     * @return true, if the field of this literal is contained in the document and the
     * string form of the field value is the string form of the value of this literal;
     * false if the field is missing or has a different value
     */
    @Override
    public boolean matches(SolrDocument doc) {
        final Object fieldValue = doc.getFieldValue(this.key.getSolrFieldName());
        if (fieldValue == null) return false;
        // "true" and "false" contain no regex metacharacters, a plain comparison is sufficient
        return fieldValue.toString().equals(String.valueOf(this.value));
    }
}

@ -0,0 +1,79 @@
/**
* CatchallLiteral
* Copyright 2014 by Michael Peter Christen
* First released 24.10.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.logic;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
/**
 * A literal term which refers to a schema field without naming a concrete value:
 * its query form combines the field name with AbstractSolrConnector.CATCHALL_TERM
 * and it matches every document in which the field is present.
 */
public class CatchallLiteral extends Literal implements Term {

    private final SchemaDeclaration key;

    /**
     * @param key the schema field whose presence is tested
     */
    public CatchallLiteral(final SchemaDeclaration key) {
        super();
        this.key = key;
    }

    /**
     * @return a new CatchallLiteral for the same key
     */
    @Override
    public Object clone() {
        return new CatchallLiteral(this.key);
    }

    /**
     * Two catchall literals are equal if they have equal keys.
     */
    @Override
    public boolean equals(Object otherTerm) {
        if (otherTerm instanceof CatchallLiteral) {
            return this.key.equals(((CatchallLiteral) otherTerm).key);
        }
        return false;
    }

    @Override
    public int hashCode() {
        return this.key.hashCode();
    }

    /**
     * create a Solr query string from this literal
     * @return the field name and the catchall term, separated by a colon
     */
    @Override
    public String toString() {
        return this.key.getSolrFieldName() + ':' + AbstractSolrConnector.CATCHALL_TERM;
    }

    /**
     * check if the field of this literal occurs in the SolrDocument
     * @param doc the document to match to this literal
     * @return true, if the document has any value for the field of this literal;
     * false if the field is missing
     */
    @Override
    public boolean matches(SolrDocument doc) {
        // the content of the field is irrelevant, only its presence counts
        return doc.getFieldValue(this.key.getSolrFieldName()) != null;
    }
}

@ -33,6 +33,12 @@ public class Conjunction extends AbstractOperations implements Operations {
super("AND"); super("AND");
} }
public Conjunction(final Term t1, final Term t2) {
super("AND");
this.addOperand(t1);
this.addOperand(t2);
}
@Override @Override
public Object clone() { public Object clone() {
Conjunction c = new Conjunction(); Conjunction c = new Conjunction();

@ -32,6 +32,12 @@ public class Disjunction extends AbstractOperations implements Operations {
public Disjunction() { public Disjunction() {
super("OR"); super("OR");
} }
public Disjunction(final Term t1, final Term t2) {
super("OR");
this.addOperand(t1);
this.addOperand(t2);
}
@Override @Override
public Object clone() { public Object clone() {

@ -20,36 +20,9 @@
package net.yacy.cora.federate.solr.logic; package net.yacy.cora.federate.solr.logic;
import org.apache.solr.common.SolrDocument; public abstract class Literal extends AbstractTerm implements Term {
import net.yacy.cora.federate.solr.SchemaDeclaration; public Literal() {
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
public class Literal extends AbstractTerm implements Term {
private SchemaDeclaration key;
private String value;
public Literal(final SchemaDeclaration key, final String value) {
this.key = key;
this.value = value;
}
@Override
public Object clone() {
return new Literal(this.key, this.value);
}
@Override
public boolean equals(Object otherTerm) {
if (!(otherTerm instanceof Literal)) return false;
Literal o = (Literal) otherTerm;
return this.key.equals(o.key) && this.value.equals(o.value);
}
@Override
public int hashCode() {
return key.hashCode() + value.hashCode();
} }
/** /**
@ -63,31 +36,6 @@ public class Literal extends AbstractTerm implements Term {
return 1; return 1;
} }
/**
* create a Solr query string from this literal
* @return a string which is a Solr query string
*/
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(this.key.getSolrFieldName());
sb.append(':').append('"').append(this.value).append('"');
return sb.toString();
}
/**
* check if the key/value pair of this literal occurs in the SolrDocument
* @param doc the document to match to this literal
* @return true, if the key of this literal is contained in the document and the
* value equals (does not equal) with the value if this literal (if the signature is false)
*/
@Override
public boolean matches(SolrDocument doc) {
Object v = doc.getFieldValue(this.key.getSolrFieldName());
if (v == null) return false;
return this.value.equals(AbstractSolrConnector.CATCHALL_TERM) || v.toString().matches(this.value);
}
@Override @Override
public Term lightestRewrite() { public Term lightestRewrite() {
return this; return this;

@ -0,0 +1,80 @@
/**
* LongLiteral
* Copyright 2014 by Michael Peter Christen
* First released 24.10.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.logic;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.federate.solr.SchemaDeclaration;
/**
 * A literal term which binds a schema field to a fixed long value.
 */
public class LongLiteral extends Literal implements Term {

    private final SchemaDeclaration key;
    private final long value;

    /**
     * @param key the schema field this literal refers to
     * @param value the number the field is compared with
     */
    public LongLiteral(final SchemaDeclaration key, final long value) {
        super();
        this.key = key;
        this.value = value;
    }

    /**
     * @return a new LongLiteral with the same key and value
     */
    @Override
    public Object clone() {
        return new LongLiteral(this.key, this.value);
    }

    /**
     * Two long literals are equal if they have equal keys and the same value.
     */
    @Override
    public boolean equals(Object otherTerm) {
        if (!(otherTerm instanceof LongLiteral)) return false;
        LongLiteral o = (LongLiteral) otherTerm;
        return this.key.equals(o.key) && this.value == o.value;
    }

    @Override
    public int hashCode() {
        return this.key.hashCode() + (int) (this.value & Integer.MAX_VALUE);
    }

    /**
     * create a Solr query string from this literal
     * @return a string which is a Solr query string: <code>fieldname:value</code> for
     * non-negative values and <code>fieldname:"value"</code> for negative values
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(this.key.getSolrFieldName()).append(':');
        if (this.value < 0) {
            // a leading '-' is the prohibit operator of the Solr query syntax;
            // a negative number (i.e. a signature hash) must be quoted to be read as a value
            sb.append('"').append(this.value).append('"');
        } else {
            sb.append(this.value);
        }
        return sb.toString();
    }

    /**
     * check if the key/value pair of this literal occurs in the SolrDocument
     * @param doc the document to match to this literal
     * @return true, if the field of this literal is contained in the document and the
     * string form of the field value is the decimal representation of the value of this
     * literal; false if the field is missing or has a different value
     */
    @Override
    public boolean matches(SolrDocument doc) {
        Object v = doc.getFieldValue(this.key.getSolrFieldName());
        if (v == null) return false;
        // plain comparison: the number is a fixed value, not a regular expression
        return v.toString().equals(Long.toString(this.value));
    }
}

@ -0,0 +1,81 @@
/**
* StringLiteral
* Copyright 2014 by Michael Peter Christen
* First released 03.08.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr.logic;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
/**
 * A literal term which binds a schema field to a fixed string value.
 * Its query form is the quoted phrase <code>fieldname:"value"</code>.
 */
public class StringLiteral extends Literal implements Term {

    private final SchemaDeclaration key;
    private final String value;

    /**
     * @param key the schema field this literal refers to
     * @param value the string the field is compared with; must not be null
     */
    public StringLiteral(final SchemaDeclaration key, final String value) {
        super();
        this.key = key;
        this.value = value;
    }

    /**
     * @return a new StringLiteral with the same key and value
     */
    @Override
    public Object clone() {
        return new StringLiteral(this.key, this.value);
    }

    /**
     * Two string literals are equal if they have equal keys and equal values.
     */
    @Override
    public boolean equals(Object otherTerm) {
        if (!(otherTerm instanceof StringLiteral)) return false;
        StringLiteral o = (StringLiteral) otherTerm;
        return this.key.equals(o.key) && this.value.equals(o.value);
    }

    @Override
    public int hashCode() {
        return key.hashCode() + value.hashCode();
    }

    /**
     * create a Solr query string from this literal
     * @return a string which is a Solr query string; the value is written as quoted
     * phrase where double quotes and backslashes inside the value are escaped
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(this.key.getSolrFieldName());
        sb.append(':').append('"').append(escapePhrase(this.value)).append('"');
        return sb.toString();
    }

    /**
     * check if the key/value pair of this literal occurs in the SolrDocument
     * @param doc the document to match to this literal
     * @return true, if the field of this literal is contained in the document and either
     * the value of this literal is AbstractSolrConnector.CATCHALL_TERM or the string form
     * of the field value equals the value of this literal; false otherwise
     */
    @Override
    public boolean matches(SolrDocument doc) {
        Object v = doc.getFieldValue(this.key.getSolrFieldName());
        if (v == null) return false;
        if (this.value.equals(AbstractSolrConnector.CATCHALL_TERM)) return true;
        // compare literally: the value is a fixed string and not a regular expression,
        // regex meta characters inside the value must neither change the result nor throw
        return v.toString().equals(this.value);
    }

    /**
     * make a string safe for the usage inside of a double-quoted phrase of a query
     * @param s the raw string
     * @return the string with a backslash in front of every double quote and backslash
     */
    private static String escapePhrase(final String s) {
        final StringBuilder sb = new StringBuilder(s.length() + 2);
        for (int i = 0; i < s.length(); i++) {
            final char c = s.charAt(i);
            if (c == '"' || c == '\\') sb.append('\\');
            sb.append(c);
        }
        return sb.toString();
    }
}

@ -61,10 +61,13 @@ import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL; import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.solr.logic.BooleanLiteral;
import net.yacy.cora.federate.solr.logic.CatchallLiteral;
import net.yacy.cora.federate.solr.logic.Conjunction; import net.yacy.cora.federate.solr.logic.Conjunction;
import net.yacy.cora.federate.solr.logic.Disjunction; import net.yacy.cora.federate.solr.logic.Disjunction;
import net.yacy.cora.federate.solr.logic.Literal; import net.yacy.cora.federate.solr.logic.LongLiteral;
import net.yacy.cora.federate.solr.logic.Negation; import net.yacy.cora.federate.solr.logic.Negation;
import net.yacy.cora.federate.solr.logic.StringLiteral;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
@ -1442,9 +1445,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
public void postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrDocument doc, final SolrInputDocument sid, final DigestURL url) { public void postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrDocument doc, final SolrInputDocument sid, final DigestURL url) {
// FIND OUT IF THIS IS A DOUBLE DOCUMENT // FIND OUT IF THIS IS A DOUBLE DOCUMENT
// term to describe documents which are indexable:
// - no noindex in meta or x-robots
// - no canonical-tag
Conjunction ValidDocTermTemplate = new Conjunction();
ValidDocTermTemplate.addOperand(new LongLiteral(CollectionSchema.httpstatus_i, 200));
ValidDocTermTemplate.addOperand(new Disjunction(new Negation(new CatchallLiteral(CollectionSchema.canonical_equal_sku_b)), new BooleanLiteral(CollectionSchema.canonical_equal_sku_b, true)));
ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 8))); // bit 3 (noindex)
ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 24))); // bit 3 + 4 (noindex + nofollow)
ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 512))); // bit 9 (noindex)
ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 1536))); // bit 9 + 10 (noindex + nofollow)
String urlhash = ASCII.String(url.hash()); String urlhash = ASCII.String(url.hash());
String hostid = url.hosthash(); String hostid = url.hosthash();
Conjunction con = new Conjunction();
Disjunction dnf = new Disjunction(); Disjunction dnf = new Disjunction();
CollectionSchema[][] doccheckschema = new CollectionSchema[][]{ CollectionSchema[][] doccheckschema = new CollectionSchema[][]{
{CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i}, {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
@ -1460,12 +1473,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (signature == null) continue uniquecheck; if (signature == null) continue uniquecheck;
//con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash))); //con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash)));
//con.addOperand(new Literal(CollectionSchema.host_id_s, hostid)); //con.addOperand(new Literal(CollectionSchema.host_id_s, hostid));
dnf.addOperand(new Literal(signaturefield, signature.toString())); dnf.addOperand(new LongLiteral(signaturefield, signature));
} }
} }
Conjunction con = (Conjunction) ValidDocTermTemplate.clone();
con.addOperand(dnf); con.addOperand(dnf);
con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash))); con.addOperand(new Negation(new StringLiteral(CollectionSchema.id, urlhash)));
con.addOperand(new Literal(CollectionSchema.host_id_s, hostid)); con.addOperand(new StringLiteral(CollectionSchema.host_id_s, hostid));
String query = con.toString(); String query = con.toString();
SolrDocumentList docsAkk; SolrDocumentList docsAkk;
try { try {
@ -1484,7 +1498,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// lookup the document with the same signature // lookup the document with the same signature
Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName()); Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName());
if (signature == null) continue uniquecheck; if (signature == null) continue uniquecheck;
SolrDocumentList docs = new Literal(signaturefield, signature.toString()).apply(docsAkk); SolrDocumentList docs = new StringLiteral(signaturefield, signature.toString()).apply(docsAkk);
if (docs.getNumFound() == 0) { if (docs.getNumFound() == 0) {
sid.setField(uniquefield.getSolrFieldName(), true); sid.setField(uniquefield.getSolrFieldName(), true);
sid.setField(countfield.getSolrFieldName(), 1); sid.setField(countfield.getSolrFieldName(), 1);
@ -1525,17 +1539,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
continue uniquecheck; continue uniquecheck;
} }
try { try {
String doccountquery = Conjunction doccountterm = (Conjunction) ValidDocTermTemplate.clone();
CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + doccountterm.addOperand(new Negation(new StringLiteral(CollectionSchema.id, urlhash)));
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3 (noindex) doccountterm.addOperand(new StringLiteral(CollectionSchema.host_id_s, hostid));
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 (noindex + nofollow) doccountterm.addOperand(new LongLiteral(signaturefield, signature));
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 (noindex) long doccount = segment.fulltext().getDefaultConnector().getCountByQuery(doccountterm.toString());
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 (noindex + nofollow)
"((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":" + AbstractSolrConnector.CATCHALL_TERM + ") OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " +
CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " +
"-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " +
signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"";
long doccount = segment.fulltext().getDefaultConnector().getCountByQuery(doccountquery);
sid.setField(uniquefield.getSolrFieldName(), doccount == 0); sid.setField(uniquefield.getSolrFieldName(), doccount == 0);
} catch (final IOException e) {} } catch (final IOException e) {}
} }

Loading…
Cancel
Save