Added new fields http_unique_b and www_unique_b, which can be used for
ranking to prefer URLs containing a www subdomain or using the HTTPS
protocol.
pull/1/head
Michael Peter Christen 11 years ago
parent 66f6797f52
commit ff5b3ac84d

@ -21,6 +21,12 @@ last_modified
## mime-type of document, string (mandatory field) ## mime-type of document, string (mandatory field)
content_type content_type
## unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false
http_unique_b
## unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false
www_unique_b
## content of title tag, text (mandatory field) ## content of title tag, text (mandatory field)
title title

@ -954,6 +954,42 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return result; return result;
} }
/**
 * Generates a normal form of this URL without the leading
 * {@code protocol://} part: optional user info, lower-cased host,
 * non-default port and the file part.
 *
 * <p>NOTE(review): for {@code mailto} URLs the returned string still starts
 * with the protocol ({@code mailto:user@host}); this is kept unchanged for
 * backward compatibility.
 *
 * @param excludeAnchor   passed through to {@code getFile(excludeAnchor, removeSessionID)}
 * @param removeSessionID passed through to {@code getFile(excludeAnchor, removeSessionID)}
 * @return the URL stub as described above
 */
public String urlstub(final boolean excludeAnchor, final boolean removeSessionID) {
    if (this.protocol.equals("mailto")) {
        return this.protocol + ":" + this.userInfo + "@" + this.host;
    }

    // the port is left out when it is unset (negative) or equal to the
    // well-known port of the protocol; file URLs never carry a port
    boolean defaultPort = false;
    if (isHTTP()) {
        defaultPort = this.port < 0 || this.port == 80;
    } else if (isHTTPS()) {
        defaultPort = this.port < 0 || this.port == 443;
    } else if (isFTP()) {
        defaultPort = this.port < 0 || this.port == 21;
    } else if (isSMB()) {
        defaultPort = this.port < 0 || this.port == 445;
    } else if (isFile()) {
        defaultPort = true;
    }

    final String urlPath = this.getFile(excludeAnchor, removeSessionID);
    final String h = getHost();
    final StringBuilder u = new StringBuilder(20 + urlPath.length() + ((h == null) ? 0 : h.length()));
    if (h != null) {
        // anonymous FTP credentials are not part of the normal form
        if (this.userInfo != null && !(this.isFTP() && this.userInfo.startsWith(FTPClient.ANONYMOUS))) {
            u.append(this.userInfo);
            u.append("@");
        }
        // host names must be lower-cased independently of the JVM default
        // locale (e.g. a Turkish locale would map 'I' to a dotless 'ı')
        u.append(h.toLowerCase(java.util.Locale.ROOT));
    }
    if (!defaultPort) {
        u.append(":");
        u.append(this.port);
    }
    u.append(urlPath);
    return u.toString();
}
@Override @Override
public int hashCode() { public int hashCode() {
return return

@ -107,6 +107,40 @@ public class SchemaConfiguration extends Configuration implements Serializable {
return sd; return sd;
} }
/**
 * Postprocessing step for the {@code http_unique_b} field: looks up the
 * document stored for the same URL under the other protocol (http vs. https)
 * and lets {@code set_unique_flag} adjust the flag of {@code sid}.
 *
 * @param segment the segment whose default fulltext connector is queried
 * @param sid     the document whose flag may be modified
 * @param url     the URL of {@code sid}
 * @return true if the flag in {@code sid} was changed; false if the field is
 *         not part of this configuration, the URL is neither http nor https,
 *         nothing had to be changed, or the lookup failed with an IOException
 */
public boolean postprocessing_http_unique(Segment segment, SolrInputDocument sid, DigestURL url) {
    final CollectionSchema field = CollectionSchema.http_unique_b;
    if (!this.contains(field)) {
        return false;
    }
    final boolean http = url.isHTTP();
    if (!http && !url.isHTTPS()) {
        return false;
    }
    try {
        // build the same URL with the opposite protocol
        final String otherProtocol = http ? "https://" : "http://";
        final DigestURL twin = new DigestURL(otherProtocol + url.urlstub(true, true));
        final String twinId = ASCII.String(twin.hash());
        final SolrDocument twinDoc = segment.fulltext().getDefaultConnector().getDocumentById(twinId, field.getSolrFieldName());
        return set_unique_flag(field, sid, twinDoc);
    } catch (final IOException ignored) {
        // best effort: if the twin cannot be built or looked up, the flag stays untouched
        return false;
    }
}
/**
 * Postprocessing step for the {@code www_unique_b} field: looks up the
 * document stored for the same URL with the {@code www.} host prefix toggled
 * (added if absent, removed if present) and lets {@code set_unique_flag}
 * adjust the flag of {@code sid}.
 *
 * <p>The URL stub may start with {@code userinfo@}; the {@code www.} prefix
 * is therefore tested and toggled on the host that follows the user info,
 * not on the start of the stub.
 *
 * @param segment the segment whose default fulltext connector is queried
 * @param sid     the document whose flag may be modified
 * @param url     the URL of {@code sid}
 * @return true if the flag in {@code sid} was changed; false if the field is
 *         not part of this configuration, nothing had to be changed, or the
 *         lookup failed with an IOException
 */
public boolean postprocessing_www_unique(Segment segment, SolrInputDocument sid, DigestURL url) {
    if (!this.contains(CollectionSchema.www_unique_b)) return false;
    final String us = url.urlstub(true, true);

    // the authority part ends at the first '/', the host starts after the
    // last '@' inside the authority (or at position 0 if there is no user info)
    final int slash = us.indexOf('/');
    final int authorityEnd = slash < 0 ? us.length() : slash;
    final int hostStart = us.lastIndexOf('@', authorityEnd - 1) + 1;
    final String credentials = us.substring(0, hostStart);
    final String hostAndPath = us.substring(hostStart);
    final String twinHostAndPath = hostAndPath.startsWith("www.") ? hostAndPath.substring(4) : "www." + hostAndPath;

    try {
        final DigestURL u = new DigestURL(url.getProtocol() + "://" + credentials + twinHostAndPath);
        final SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.www_unique_b.getSolrFieldName());
        return set_unique_flag(CollectionSchema.www_unique_b, sid, d);
    } catch (final IOException ignored) {
        // best effort: if the twin cannot be built or looked up, the flag stays untouched
    }
    return false;
}
/**
 * Compares the boolean flag {@code field} of {@code sid} with the same flag
 * of the twin document {@code d}. If both have the same value, the flag of
 * {@code sid} is inverted so that the two documents differ afterwards.
 * A missing document, a missing field value or a non-Boolean value counts
 * as {@code false}.
 *
 * @param field the boolean schema field to compare
 * @param sid   the document whose flag may be inverted
 * @param d     the twin document, may be null if none exists
 * @return true if the flag in {@code sid} was changed, false otherwise
 */
private boolean set_unique_flag(CollectionSchema field, SolrInputDocument sid, SolrDocument d) {
    final String fieldName = field.getSolrFieldName();
    // Boolean.TRUE.equals is null-safe and does not throw on unexpected value types
    final boolean ownFlag = Boolean.TRUE.equals(sid.getFieldValue(fieldName));
    final boolean twinFlag = d != null && Boolean.TRUE.equals(d.getFieldValue(fieldName));
    if (ownFlag != twinFlag) {
        return false;
    }
    sid.setField(fieldName, !ownFlag);
    return true;
}
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) { public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false; boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT // FIND OUT IF THIS IS A DOUBLE DOCUMENT

@ -360,11 +360,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
(target_host.equals(source_host) || (target_host.equals(source_host) ||
target_host.equals("www." + source_host) || target_host.equals("www." + source_host) ||
source_host.equals("www." + target_host))); // well, not everybody defines 'outbound' that way but however, thats used here. source_host.equals("www." + target_host))); // well, not everybody defines 'outbound' that way but however, thats used here.
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
int ioidx = inbound ? 0 : 1; int ioidx = inbound ? 0 : 1;
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target)); subgraph.urlProtocols[ioidx].add(target_url.getProtocol());
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3)); subgraph.urlStubs[ioidx].add(target_url.urlstub(true, true));
subgraph.urlAnchorTexts[ioidx].add(text); subgraph.urlAnchorTexts[ioidx].add(text);
return inbound; return inbound;
} }
@ -401,7 +399,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL)); String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>(); Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
String host = digestURL.getHost();
String us = digestURL.toNormalform(true); String us = digestURL.toNormalform(true);
int crawldepth = document.getDepth(); int crawldepth = document.getDepth();
@ -477,6 +475,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
List<String> synonyms = condenser.synonyms(); List<String> synonyms = condenser.synonyms();
add(doc, CollectionSchema.synonyms_sxt, synonyms); add(doc, CollectionSchema.synonyms_sxt, synonyms);
} }
// unique-fields; these values must be corrected during postprocessing.
add(doc, CollectionSchema.http_unique_b, digestURL.isHTTPS()); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, host != null && host.startsWith("www.")); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature()); add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage! add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing! add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
@ -485,7 +488,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage! add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing! add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) || if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) ||
this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i)) { this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i) ||
this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) {
processTypes.add(ProcessType.UNIQUE); processTypes.add(ProcessType.UNIQUE);
} }
@ -1166,7 +1170,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
long count = collectionConnector.getCountByQuery(query); long count = collectionConnector.getCountByQuery(query);
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey); ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(query, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, 86400000, 200, 1); BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
query,
CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc," + // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
CollectionSchema.url_chars_i.getSolrFieldName() + " asc",
0, 100000000, 86400000, 200, 1);
int countcheck = 0; int countcheck = 0;
Collection<String> failids = new ArrayList<String>(); Collection<String> failids = new ArrayList<String>();
SolrDocument doc; SolrDocument doc;
@ -1199,7 +1208,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} }
if (tagtype == ProcessType.UNIQUE) { if (tagtype == ProcessType.UNIQUE) {
if (postprocessing_doublecontent(segment, uniqueURLs, sid, url)) proccount_uniquechange++; boolean uniquechange = false;
uniquechange |= postprocessing_http_unique(segment, sid, url);
uniquechange |= postprocessing_www_unique(segment, sid, url);
uniquechange |= postprocessing_doublecontent(segment, uniqueURLs, sid, url);
if (uniquechange) proccount_uniquechange++;
} }
} catch (IllegalArgumentException e) {} } catch (IllegalArgumentException e) {}

@ -36,6 +36,8 @@ public enum CollectionSchema implements SchemaDeclaration {
//sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr. //sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
content_type(SolrType.string, true, true, true, false, false, "mime-type of document"), content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"),
www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"),
title(SolrType.text_general, true, true, true, false, true, "content of title tag"), title(SolrType.text_general, true, true, true, false, true, "content of title tag"),
title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"), title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"),
title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"), title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"),

Loading…
Cancel
Save