pull/1/head
Michael Peter Christen 11 years ago
parent d80418f1b1
commit 2e09da9832

@ -1326,9 +1326,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (tagtype == ProcessType.UNIQUE) {
postprocessing_http_unique(segment, sid, url);
postprocessing_www_unique(segment, sid, url);
postprocessing_doublecontent(segment, uniqueURLs, sid, url);
postprocessing_http_unique(segment, doc, sid, url);
postprocessing_www_unique(segment, doc, sid, url);
postprocessing_doublecontent(segment, uniqueURLs, doc, sid, url);
}
} catch (IllegalArgumentException e) {}
@ -1398,35 +1398,35 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return allcount.get();
}
/**
 * Looks up the counterpart of {@code url} under the other web protocol
 * (http vs. https) and hands the result to {@code set_unique_flag} for the
 * {@code http_unique_b} field.
 *
 * NOTE(review): two signature lines and two {@code set_unique_flag} calls
 * appear below; this looks like a diff listing in which the first of each
 * pair is the pre-change form and the second the post-change form (which
 * adds the {@code doc} parameter) -- confirm against the real file.
 *
 * @param segment used to reach the default Solr connector via {@code fulltext()}
 * @param doc     passed through to {@code set_unique_flag} (post-change form only)
 * @param sid     passed through to {@code set_unique_flag}
 * @param url     the URL whose protocol counterpart is looked up
 */
public void postprocessing_http_unique(Segment segment, SolrInputDocument sid, DigestURL url) {
public void postprocessing_http_unique(final Segment segment, final SolrDocument doc, final SolrInputDocument sid, final DigestURL url) {
// nothing to do if the schema does not contain the http_unique_b field
if (!this.contains(CollectionSchema.http_unique_b)) return;
// only http and https URLs are handled
if (!url.isHTTPS() && !url.isHTTP()) return;
try {
// same URL stub, but prefixed with the opposite protocol: http -> https, https -> http
DigestURL u = new DigestURL((url.isHTTP() ? "https://" : "http://") + url.urlstub(true, true));
// fetch the counterpart by the hash of that URL; the field name argument
// presumably restricts the loaded fields to http_unique_b -- TODO confirm
SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.http_unique_b.getSolrFieldName());
set_unique_flag(CollectionSchema.http_unique_b, sid, d);
set_unique_flag(CollectionSchema.http_unique_b, doc, sid, d);
// NOTE(review): an IOException is swallowed here without logging, so in
// that case no flag is written
} catch (final IOException e) {}
}
/**
 * Looks up the counterpart of {@code url} with the leading "www." toggled
 * (removed if present, added if absent) and hands the result to
 * {@code set_unique_flag} for the {@code www_unique_b} field.
 *
 * NOTE(review): two signature lines and two {@code set_unique_flag} calls
 * appear below; this looks like a diff listing in which the first of each
 * pair is the pre-change form and the second the post-change form (which
 * adds the {@code doc} parameter) -- confirm against the real file.
 *
 * @param segment used to reach the default Solr connector via {@code fulltext()}
 * @param doc     passed through to {@code set_unique_flag} (post-change form only)
 * @param sid     passed through to {@code set_unique_flag}
 * @param url     the URL whose www/non-www counterpart is looked up
 */
public void postprocessing_www_unique(Segment segment, SolrInputDocument sid, DigestURL url) {
public void postprocessing_www_unique(final Segment segment, final SolrDocument doc, final SolrInputDocument sid, final DigestURL url) {
// nothing to do if the schema does not contain the www_unique_b field
if (!this.contains(CollectionSchema.www_unique_b)) return;
final String us = url.urlstub(true, true);
try {
// keep the protocol; strip a leading "www." from the stub, or prepend one if it is missing
DigestURL u = new DigestURL(url.getProtocol() + (us.startsWith("www.") ? "://" + us.substring(4) : "://www." + us));
// fetch the counterpart by the hash of that URL; the field name argument
// presumably restricts the loaded fields to www_unique_b -- TODO confirm
SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.www_unique_b.getSolrFieldName());
set_unique_flag(CollectionSchema.www_unique_b, sid, d);
set_unique_flag(CollectionSchema.www_unique_b, doc, sid, d);
// NOTE(review): an IOException is swallowed here without logging, so in
// that case no flag is written
} catch (final IOException e) {}
}
/**
 * Compares the boolean value of {@code field} in the current document with
 * the value of the same field in the counterpart document {@code d} and, if
 * both are equal, writes the negated current value into {@code sid}.
 *
 * NOTE(review): the header and the first statement each appear twice below;
 * this looks like a diff listing in which the first header/statement pair is
 * the pre-change form (current value read from {@code sid}) and the second
 * pair the post-change form (current value read from {@code doc}) -- confirm
 * against the real file.
 *
 * @param field the schema field holding the flag
 * @param doc   document the current flag value is read from (post-change form)
 * @param sid   input document that receives the new flag value
 * @param d     counterpart document; may be {@code null}
 */
private void set_unique_flag(CollectionSchema field, SolrInputDocument sid, SolrDocument d) {
Object sb = sid.getFieldValue(field.getSolrFieldName());
private void set_unique_flag(CollectionSchema field, final SolrDocument doc, final SolrInputDocument sid, final SolrDocument d) {
Object sb = doc.getFieldValue(field.getSolrFieldName());
// a missing field value counts as false; the cast assumes the stored value is a Boolean
boolean sbb = sb != null && ((Boolean) sb).booleanValue();
// a missing counterpart document or a missing field value also counts as false
Object ob = d == null ? null : d.getFieldValue(field.getSolrFieldName());
boolean obb = ob != null && ((Boolean) ob).booleanValue();
// both flags agree: store the negation of the current value in sid;
// when they differ nothing is written
if (sbb == obb) sid.setField(field.getSolrFieldName(), !sbb);
}
public void postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
public void postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrDocument doc, final SolrInputDocument sid, final DigestURL url) {
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
String urlhash = ASCII.String(url.hash());
String hostid = url.hosthash();
@ -1442,7 +1442,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) {
// lookup the document with the same signature
Long signature = (Long) sid.getField(signaturefield.getSolrFieldName()).getValue();
Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName());
if (signature == null) continue uniquecheck;
//con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash)));
//con.addOperand(new Literal(CollectionSchema.host_id_s, hostid));
@ -1468,7 +1468,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) {
// lookup the document with the same signature
Long signature = (Long) sid.getField(signaturefield.getSolrFieldName()).getValue();
Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName());
if (signature == null) continue uniquecheck;
SolrDocumentList docs = new Literal(signaturefield, signature.toString()).apply(docsAkk);
if (docs.getNumFound() == 0) {
@ -1487,10 +1487,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// in case that the document has no status code 200, has a noindex attribute
// or a canonical tag which does not point to the document itself,
// then the unique-field is not written at all!
Integer robots_i = this.contains(CollectionSchema.robots_i) ? (Integer) sid.getFieldValue(CollectionSchema.robots_i.getSolrFieldName()) : null;
Integer httpstatus_i = this.contains(CollectionSchema.httpstatus_i) ? (Integer) sid.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) : null;
String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null;
Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null;
Integer robots_i = this.contains(CollectionSchema.robots_i) ? (Integer) doc.getFieldValue(CollectionSchema.robots_i.getSolrFieldName()) : null;
Integer httpstatus_i = this.contains(CollectionSchema.httpstatus_i) ? (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) : null;
String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) doc.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null;
Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) doc.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null;
CollectionSchema[][] metadatacheckschema = new CollectionSchema[][]{
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
@ -1506,7 +1506,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (this.contains(checkfield) && this.contains(signaturefield) && this.contains(uniquefield)) {
// lookup in the index within the same hosts for the same title or description
//String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
Long signature = (Long) sid.getFieldValue(signaturefield.getSolrFieldName());
Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName());
if (signature == null) {
continue uniquecheck;
}

Loading…
Cancel
Save