added new fields http_unique_b and www_unique_b which can be used for ranking to prefer URLs containing a www subdomain or using the https protocol
pull/1/head
Michael Peter Christen 11 years ago
parent 66f6797f52
commit ff5b3ac84d
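The commit message points at a ranking use for the two new boolean fields. As a hedged illustration only (a generic Solr edismax boost query, not YaCy's actual ranking configuration), boosting documents that carry the flags could look like this; the boost factor 2.0 is an arbitrary example value:

```java
import org.apache.solr.client.solrj.SolrQuery;

public class UniqueFieldBoostSketch {
    public static void main(String[] args) {
        // plain SolrJ query against an index that contains the new fields;
        // documents whose https / www variant carries the "unique" flag get boosted
        SolrQuery q = new SolrQuery("yacy");
        q.set("defType", "edismax");
        q.set("bq", "http_unique_b:true^2.0 www_unique_b:true^2.0"); // example boosts, not a YaCy default
        System.out.println(q); // prints the encoded query parameters
    }
}
```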

@@ -21,6 +21,12 @@ last_modified
## mime-type of document, string (mandatory field)
content_type
## unique-field which is true when a URL appears for the first time. If the same URL that was http then appears as https (or vice versa), the field is false
http_unique_b
## unique-field which is true when a URL appears for the first time. If the same URL with the subdomain www then appears without that subdomain (or vice versa), the field is false
www_unique_b
## content of title tag, text (mandatory field)
title

@@ -911,7 +911,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (c >= '0' && c <= '9') return CharType.number;
return CharType.high;
}
public String toNormalform(final boolean excludeAnchor) {
return toNormalform(excludeAnchor, false);
}
@@ -954,6 +954,42 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return result;
}
public String urlstub(final boolean excludeAnchor, final boolean removeSessionID) {
// generates a normal form of the URL without the protocol prefix
boolean defaultPort = false;
if (this.protocol.equals("mailto")) {
return this.protocol + ":" + this.userInfo + "@" + this.host;
} else if (isHTTP()) {
if (this.port < 0 || this.port == 80) { defaultPort = true; }
} else if (isHTTPS()) {
if (this.port < 0 || this.port == 443) { defaultPort = true; }
} else if (isFTP()) {
if (this.port < 0 || this.port == 21) { defaultPort = true; }
} else if (isSMB()) {
if (this.port < 0 || this.port == 445) { defaultPort = true; }
} else if (isFile()) {
defaultPort = true;
}
String urlPath = this.getFile(excludeAnchor, removeSessionID);
String h = getHost();
final StringBuilder u = new StringBuilder(20 + urlPath.length() + ((h == null) ? 0 : h.length()));
if (h != null) {
if (this.userInfo != null && !(this.isFTP() && this.userInfo.startsWith(FTPClient.ANONYMOUS))) {
u.append(this.userInfo);
u.append("@");
}
u.append(h.toLowerCase());
}
if (!defaultPort) {
u.append(":");
u.append(this.port);
}
u.append(urlPath);
String result = u.toString();
return result;
}
@Override
public int hashCode() {
return
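For reference, a minimal sketch of what urlstub(true, true) produces, using the DigestURL class that appears later in this commit; the package path is assumed and may differ between YaCy versions, and the expected output follows from the method above (lower-cased host, default port dropped, anchor excluded, no protocol prefix):

```java
import net.yacy.cora.document.id.DigestURL; // assumed package; may differ in older YaCy versions

public class UrlstubSketch {
    public static void main(String[] args) throws Exception {
        // excludeAnchor = true, removeSessionID = true
        DigestURL u = new DigestURL("https://www.Example.com:443/path?x=1#top");
        System.out.println(u.urlstub(true, true)); // expected: www.example.com/path?x=1
    }
}
```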

@@ -107,6 +107,40 @@ public class SchemaConfiguration extends Configuration implements Serializable {
return sd;
}
public boolean postprocessing_http_unique(Segment segment, SolrInputDocument sid, DigestURL url) {
if (!this.contains(CollectionSchema.http_unique_b)) return false;
if (!url.isHTTPS() && !url.isHTTP()) return false;
try {
DigestURL u = new DigestURL((url.isHTTP() ? "https://" : "http://") + url.urlstub(true, true));
SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.http_unique_b.getSolrFieldName());
return set_unique_flag(CollectionSchema.http_unique_b, sid, d);
} catch (final IOException e) {}
return false;
}
public boolean postprocessing_www_unique(Segment segment, SolrInputDocument sid, DigestURL url) {
if (!this.contains(CollectionSchema.www_unique_b)) return false;
final String us = url.urlstub(true, true);
try {
DigestURL u = new DigestURL(url.getProtocol() + (us.startsWith("www.") ? "://" + us.substring(4) : "://www." + us));
SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.www_unique_b.getSolrFieldName());
return set_unique_flag(CollectionSchema.www_unique_b, sid, d);
} catch (final IOException e) {}
return false;
}
private boolean set_unique_flag(CollectionSchema field, SolrInputDocument sid, SolrDocument d) {
Object sb = sid.getFieldValue(field.getSolrFieldName());
boolean sbb = sb != null && ((Boolean) sb).booleanValue();
Object ob = d == null ? null : d.getFieldValue(field.getSolrFieldName());
boolean obb = ob != null && ((Boolean) ob).booleanValue();
if (sbb == obb) {
sid.setField(field.getSolrFieldName(), !sbb);
return true;
}
return false;
}
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
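The two postprocessing methods above look up the "counterpart" document, i.e. the same URL stub under the opposite protocol or with the www prefix toggled, by its document hash, and set_unique_flag then compares the two boolean values. A plain-string sketch of how those counterpart URLs are derived, with hypothetical URLs and no YaCy classes:

```java
public class CounterpartSketch {
    // opposite-protocol variant, mirroring postprocessing_http_unique
    static String httpCounterpart(boolean isHTTP, String urlstub) {
        return (isHTTP ? "https://" : "http://") + urlstub;
    }

    // www-toggled variant, mirroring postprocessing_www_unique
    static String wwwCounterpart(String protocol, String urlstub) {
        return protocol + (urlstub.startsWith("www.") ? "://" + urlstub.substring(4) : "://www." + urlstub);
    }

    public static void main(String[] args) {
        System.out.println(httpCounterpart(true, "example.com/page"));       // https://example.com/page
        System.out.println(wwwCounterpart("http", "example.com/page"));      // http://www.example.com/page
        System.out.println(wwwCounterpart("https", "www.example.com/page")); // https://example.com/page
    }
}
```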

@@ -360,11 +360,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
(target_host.equals(source_host) ||
target_host.equals("www." + source_host) ||
source_host.equals("www." + target_host))); // well, not everybody defines 'outbound' that way, but that's how it is used here.
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
int ioidx = inbound ? 0 : 1;
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
subgraph.urlProtocols[ioidx].add(target_url.getProtocol());
subgraph.urlStubs[ioidx].add(target_url.urlstub(true, true));
subgraph.urlAnchorTexts[ioidx].add(text);
return inbound;
}
@@ -401,7 +399,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
String host = digestURL.getHost();
String us = digestURL.toNormalform(true);
int crawldepth = document.getDepth();
@@ -477,6 +475,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
List<String> synonyms = condenser.synonyms();
add(doc, CollectionSchema.synonyms_sxt, synonyms);
}
// unique-fields; these values must be corrected during postprocessing.
add(doc, CollectionSchema.http_unique_b, digestURL.isHTTPS()); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, host != null && host.startsWith("www.")); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
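The two add(...) calls for the new fields only seed the flags from properties of the URL itself; the postprocessing step reconciles them against the counterpart document later. A small sketch of the seeded values for some hypothetical URLs:

```java
public class InitialUniqueFlagsSketch {
    // mirrors the seeding above: http_unique_b <- isHTTPS(), www_unique_b <- host.startsWith("www.")
    static String seed(String protocol, String host) {
        boolean httpUnique = "https".equals(protocol);
        boolean wwwUnique = host != null && host.startsWith("www.");
        return protocol + "://" + host + " -> http_unique_b=" + httpUnique + ", www_unique_b=" + wwwUnique;
    }

    public static void main(String[] args) {
        System.out.println(seed("http", "example.com"));      // http_unique_b=false, www_unique_b=false
        System.out.println(seed("https", "www.example.com")); // http_unique_b=true,  www_unique_b=true
    }
}
```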
@@ -485,7 +488,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) ||
this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i)) {
this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i) ||
this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) {
processTypes.add(ProcessType.UNIQUE);
}
@@ -1166,7 +1170,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
long count = collectionConnector.getCountByQuery(query);
long start = System.currentTimeMillis();
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(query, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, 86400000, 200, 1);
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
query,
CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without a subdomain first; that gives an opportunity to set the www_unique_b flag to false
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc," + // sort on protocol to get http before https; that gives an opportunity to set the http_unique_b flag to false
CollectionSchema.url_chars_i.getSolrFieldName() + " asc",
0, 100000000, 86400000, 200, 1);
int countcheck = 0;
Collection<String> failids = new ArrayList<String>();
SolrDocument doc;
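The ordering that the two sort comments rely on is plain lexicographic string order, assuming documents without a subdomain carry an empty host_subdomain_s value: "http" sorts before "https", and the empty subdomain sorts before "www". A trivial check:

```java
import java.util.Arrays;

public class PostprocessingOrderCheck {
    public static void main(String[] args) {
        String[] protocols = {"https", "http"};
        Arrays.sort(protocols);
        System.out.println(Arrays.toString(protocols)); // [http, https]

        String[] subdomains = {"www", ""};
        Arrays.sort(subdomains);
        System.out.println(Arrays.toString(subdomains)); // [, www]
    }
}
```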
@@ -1199,7 +1208,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (tagtype == ProcessType.UNIQUE) {
if (postprocessing_doublecontent(segment, uniqueURLs, sid, url)) proccount_uniquechange++;
boolean uniquechange = false;
uniquechange |= postprocessing_http_unique(segment, sid, url);
uniquechange |= postprocessing_www_unique(segment, sid, url);
uniquechange |= postprocessing_doublecontent(segment, uniqueURLs, sid, url);
if (uniquechange) proccount_uniquechange++;
}
} catch (IllegalArgumentException e) {}

@@ -36,6 +36,8 @@ public enum CollectionSchema implements SchemaDeclaration {
//sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when a URL appears for the first time. If the same URL that was http then appears as https (or vice versa), the field is false"),
www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when a URL appears for the first time. If the same URL with the subdomain www then appears without that subdomain (or vice versa), the field is false"),
title(SolrType.text_general, true, true, true, false, true, "content of title tag"),
title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"),
title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"),
