diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 898179b7a..e9345c115 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -1163,52 +1163,59 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Thread.currentThread().setName(name); SolrDocument doc; String id; try { - while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields); - Collection proctags = doc.getFieldValues(WebgraphSchema.process_sxt.getSolrFieldName()); - - for (Object tag: proctags) try { - - // switch over tag types - ProcessType tagtype = ProcessType.valueOf((String) tag); + processloop: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + try { + SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields); + Collection proctags = doc.getFieldValues(WebgraphSchema.process_sxt.getSolrFieldName()); + + for (Object tag: proctags) try { - // set cr values - if (tagtype == ProcessType.CITATION) { - if (segment.fulltext().useWebgraph() && webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) { - id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); - CRV crv = rankings.get(id); - if (crv != null) { - sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn); + // switch over tag types + ProcessType tagtype = ProcessType.valueOf((String) tag); + + // set cr values + if (tagtype == ProcessType.CITATION) { + if (segment.fulltext().useWebgraph() && webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) { + id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); + CRV crv = rankings.get(id); + if (crv != null) { + sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn); + } } - } - if (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)) { - id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()); - CRV crv = rankings.get(id); - if (crv != null) { - sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn); + if (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)) { + id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()); + CRV crv = rankings.get(id); + if (crv != null) { + sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn); + } } } + } catch (IllegalArgumentException e) { + ConcurrentLog.logException(e); } - } catch (IllegalArgumentException e) {} - - // write document back to index - try { - sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName()); - sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName()); - //segment.fulltext().getWebgraphConnector().deleteById((String) sid.getFieldValue(WebgraphSchema.id.getSolrFieldName())); - segment.fulltext().getWebgraphConnector().add(sid); - } catch (SolrException e) { - ConcurrentLog.logException(e); - } catch (IOException e) { + + // write document back to index + try { + sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName()); + sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName()); + //segment.fulltext().getWebgraphConnector().deleteById((String) sid.getFieldValue(WebgraphSchema.id.getSolrFieldName())); + segment.fulltext().getWebgraphConnector().add(sid); + } catch (SolrException e) { + ConcurrentLog.logException(e); + } catch (IOException e) { + ConcurrentLog.logException(e); + } + proccount.incrementAndGet(); + allcount.incrementAndGet(); + if (proccount.get() % 1000 == 0) { + postprocessingActivity = "writing cr values to webgraph for host " + hostfinal + "postprocessed " + proccount + " from " + count + " documents; " + + (proccount.get() * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " + + ((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining"; + ConcurrentLog.info("CollectionConfiguration", postprocessingActivity); + } + } catch (Throwable e) { ConcurrentLog.logException(e); - } - proccount.incrementAndGet(); - allcount.incrementAndGet(); - if (proccount.get() % 1000 == 0) { - postprocessingActivity = "writing cr values to webgraph for host " + hostfinal + "postprocessed " + proccount + " from " + count + " documents; " + - (proccount.get() * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " + - ((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining"; - ConcurrentLog.info("CollectionConfiguration", postprocessingActivity); + continue processloop; } } } catch (InterruptedException e) { @@ -1218,7 +1225,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri }; t[i.get()].start(); } - for (int i = 0; i < t.length; i++) try {t[i].join();} catch (InterruptedException e) {} + for (int i = 0; i < t.length; i++) try { + t[i].join(10000); + if (t[i].isAlive()) t[i].interrupt(); + } catch (InterruptedException e) {} if (count != proccount.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous webgraph document count for host " + host + ": expected=" + count + ", counted=" + proccount); }