Introduced new Solr field crawldepth_i which records the crawl depth of
a document. This is the upper limit for the clickdepth_i value, which may
be smaller if the crawler did not take the shortest path to the document.
pull/1/head
Michael Peter Christen 11 years ago
parent d321b0314e
commit cca851a417
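
The relationship between the two fields can be illustrated with a small, hypothetical helper (not part of this commit): since the crawl depth can only overestimate the click depth, it serves as a conservative fallback when no click depth has been computed yet.

    // Hypothetical illustration, not part of this commit: crawldepth_i is an
    // upper bound for clickdepth_i, so a missing click depth can be replaced
    // by the crawl depth without ever underestimating the real value.
    public class DepthBoundExample {
        static int effectiveClickDepth(Integer clickdepth, int crawldepth) {
            // use the measured click depth if present, otherwise fall back
            // to the crawl depth, which is only an upper limit
            if (clickdepth != null && clickdepth >= 0) return Math.min(clickdepth, crawldepth);
            return crawldepth;
        }

        public static void main(String[] args) {
            System.out.println(effectiveClickDepth(null, 3)); // 3 (bound only)
            System.out.println(effectiveClickDepth(1, 3));    // 1 (measured)
        }
    }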

@@ -87,6 +87,9 @@ references_exthosts_i
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
clickdepth_i
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i
crawldepth_i
## needed (post-)processing steps on this metadata set
process_sxt
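
The new field can be filtered like any other integer field in Solr. A minimal sketch (the full-text field name text_t and the search term are assumptions; the range filter is standard Solr syntax):

    // Sketch only: build a Solr query string that restricts results to
    // documents at most one crawl hop away from the start URL.
    public class CrawlDepthQueryExample {
        public static void main(String[] args) {
            String q  = "text_t:yacy";            // assumed full-text field
            String fq = "crawldepth_i:[0 TO 1]";  // filter on the new field
            System.out.println("q=" + q + "&fq=" + fq);
        }
    }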

@@ -298,6 +298,7 @@ public class HostBrowser {
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.clickdepth_i.getSolrFieldName(),
CollectionSchema.crawldepth_i.getSolrFieldName(),
CollectionSchema.references_i.getSolrFieldName(),
CollectionSchema.references_internal_i.getSolrFieldName(),
CollectionSchema.references_external_i.getSolrFieldName(),
@@ -560,17 +561,19 @@ public class HostBrowser {
public static final class InfoCacheEntry {
public Integer cr_n;
public Double cr_c;
public int clickdepth, references, references_internal, references_external, references_exthosts;
public int clickdepth, crawldepth, references, references_internal, references_external, references_exthosts;
public List<String> references_internal_urls, references_external_urls;
public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) {
this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue();
this.crawldepth = (cr == null || cr.intValue() < 0) ? 999 : cr.intValue();
this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
// calculate the url reference list
@@ -622,14 +625,11 @@ public class HostBrowser {
}
if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:</br>");
return
(this.clickdepth >= 0 ?
"clickdepth: " + this.clickdepth :
"") +
(this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : "") +
(this.crawldepth >= 0 ? ", crawldepth: " + this.crawldepth : "") +
(this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
(this.cr_n != null ? ", crn=" + this.cr_n : "") +
(this.references >= 0 ?
", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() :
"");
(this.references >= 0 ? ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : "");
}
}
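
The null/negative handling above follows the same pattern for both depth fields; a minimal sketch of that pattern, assuming the same SolrDocument API, could be factored out like this (illustration only, not part of the patch):

    // Minimal sketch: unknown or invalid depth values read from Solr are
    // mapped to the sentinel 999, as in the constructor above.
    import org.apache.solr.common.SolrDocument;

    public class DepthFieldReader {
        static int depthOrSentinel(final SolrDocument doc, final String fieldName) {
            final Integer v = (Integer) doc.getFieldValue(fieldName);
            return (v == null || v.intValue() < 0) ? 999 : v.intValue();
        }
    }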

@@ -95,6 +95,7 @@ public class Document {
private final Object parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date date;
private int crawldepth;
public Document(final DigestURL location, final String mimeType, final String charset,
final Object parserObject,
@@ -146,8 +147,9 @@
this.text = text == null ? "" : text;
this.generic_facets = new HashMap<String, Set<String>>();
this.date = date == null ? new Date() : date;
this.crawldepth = 999; // unknown yet
}
/**
* Get the content domain of a document. This tries to get the content domain from the mime type
* and if this fails it uses alternatively the content domain from the file extension.
@@ -740,6 +742,14 @@ dc_rights
return this.indexingDenied;
}
public void setDepth(int depth) {
this.crawldepth = depth;
}
public int getDepth() {
return this.crawldepth;
}
public void writeXML(final Writer os, final Date date) throws IOException {
os.write("<record>\n");
final String title = dc_title();
@@ -819,6 +829,7 @@ dc_rights
double lon = 0.0d, lat = 0.0d;
Date date = new Date();
int mindepth = 999;
for (final Document doc: docs) {
if (doc == null) continue;
@@ -857,6 +868,8 @@ dc_rights
images.putAll(doc.getImages());
if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
if (doc.date.before(date)) date = doc.date;
if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
}
// clean up parser data
@@ -871,7 +884,7 @@ dc_rights
// return consolidation
ArrayList<String> titlesa = new ArrayList<String>();
titlesa.addAll(titles);
return new Document(
Document newDoc = new Document(
location,
globalMime,
null,
@@ -890,6 +903,8 @@ dc_rights
images,
false,
date);
newDoc.setDepth(mindepth);
return newDoc;
}
public static Map<DigestURL, String> getHyperlinks(final Document[] documents) {
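
The merge rule added above gives the combined document the smallest crawl depth of its parts, with 999 still meaning "unknown". This standalone sketch shows the rule in isolation (illustration only, not code from the patch):

    // Standalone sketch of the min-depth merge rule used when merging documents.
    import java.util.Arrays;
    import java.util.List;

    public class MinDepthMergeExample {
        static int mergedDepth(final List<Integer> depths) {
            int mindepth = 999; // unknown yet, same sentinel as in Document
            for (final Integer d : depths) {
                if (d != null && d < mindepth) mindepth = d;
            }
            return mindepth;
        }

        public static void main(String[] args) {
            System.out.println(mergedDepth(Arrays.asList(4, 2, 7)));   // 2
            System.out.println(mergedDepth(Arrays.<Integer>asList())); // 999 (unknown)
        }
    }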

@@ -2591,6 +2591,9 @@ public final class Switchboard extends serverSwitch {
response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(true)).matches()
)
) {
for (Document d: documents) d.setDepth(response.depth());
// get the hyperlinks
final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
if (response.profile().indexMedia()) {

@@ -377,10 +377,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} else {
clickdepth = 999;
}
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // thats not true if the start url was not a root URL. We need a test for that.
if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
}
if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
CollectionSchema.crawldepth_i.add(doc, document.getDepth());
}
if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
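
The Math.min shortcut above is only applied for the first two crawl levels, and the in-code comment notes that it is additionally only correct when the crawl started at a root URL, a check that does not exist yet. A hypothetical guarded version of the rule (the startedAtRoot flag is an assumption, not part of the patch) could look like this:

    // Hypothetical sketch: bound the click depth by the crawl depth only when
    // the crawl started at a root URL and the document is at most one hop deep.
    public class ClickDepthBoundExample {
        static int boundClickDepth(int clickdepth, int crawldepth, boolean startedAtRoot) {
            if (startedAtRoot && crawldepth < 2) return Math.min(clickdepth, crawldepth);
            return clickdepth; // otherwise the crawl path gives no reliable bound
        }
    }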

@@ -58,6 +58,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
