fix for source and target clickdepth in webgraph index

pull/1/head
Michael Peter Christen 11 years ago
parent f686ae30a4
commit 61ad194065

@@ -179,7 +179,6 @@ public class SchemaConfiguration extends Configuration implements Serializable {
} }
public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield, final int maxtime) { public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield, final int maxtime) {
if (!this.contains(clickdepthfield)) return false;
// get new click depth and compare with old // get new click depth and compare with old
Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName()); Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName());
if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again

@@ -1161,7 +1161,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// switch over tag types // switch over tag types
ProcessType tagtype = ProcessType.valueOf((String) tag); ProcessType tagtype = ProcessType.valueOf((String) tag);
if (tagtype == ProcessType.CLICKDEPTH) { if (tagtype == ProcessType.CLICKDEPTH && collection.contains(CollectionSchema.clickdepth_i)) {
if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i, 100)) proccount_clickdepthchange++; if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i, 100)) proccount_clickdepthchange++;
} }

@@ -208,9 +208,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length); add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.source_path_folders_sxt, paths); add(edge, WebgraphSchema.source_path_folders_sxt, paths);
} }
if (this.contains(WebgraphSchema.source_clickdepth_i) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { if ((allAttr || contains(WebgraphSchema.source_clickdepth_i)) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source); add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH); processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
} }
// add the source attributes about the target // add the source attributes about the target
@@ -276,16 +276,14 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.target_path_folders_sxt, paths); add(edge, WebgraphSchema.target_path_folders_sxt, paths);
} }
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) { if (target_url.probablyRootURL()) {
if (target_url.probablyRootURL()) { boolean lc = this.lazy; this.lazy = false;
boolean lc = this.lazy; this.lazy = false; add(edge, WebgraphSchema.target_clickdepth_i, 0);
add(edge, WebgraphSchema.target_clickdepth_i, 0); this.lazy = lc;
this.lazy = lc; } else {
} else { add(edge, WebgraphSchema.target_clickdepth_i, 999);
add(edge, WebgraphSchema.target_clickdepth_i, 999); processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
} }
} }

Loading…
Cancel
Save