From f94c91315bce0fb5a328ce7204a9066ff457e9e3 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 24 Jul 2014 15:35:53 +0200 Subject: [PATCH] if the webgraph is used, then use it also for reference computation to avoid contradictions with references_i in the collection index. --- source/net/yacy/search/index/Segment.java | 47 +++++++++++------------ 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index dff407505..a8a005748 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -241,31 +241,7 @@ public class Segment { this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0); this.internalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0); this.externalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0); - try { - if (connectedCitation()) { - // read the references from the citation index - ReferenceContainer references; - references = urlCitation().get(id, null); - if (references == null) return; // no references at all - Iterator ri = references.entries(); - while (ri.hasNext()) { - CitationReference ref = ri.next(); - byte[] hh = ref.hosthash(); // host hash - if (ByteBuffer.equals(hh, 0, id, 6, 6)) { - internalIDs.put(ref.urlhash()); - internal++; - } else { - externalHosts.put(hh); - externalIDs.put(ref.urlhash()); - external++; - } - } - } - } catch (SpaceExceededException e) { - // the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now - if (Segment.this.fulltext.useWebgraph()) internalIDs.clear(); - } - if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) { + if (Segment.this.fulltext.useWebgraph()) { // reqd the references from the webgraph SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector(); BlockingQueue docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, Long.MAX_VALUE, 100, 1, WebgraphSchema.source_id_s.getSolrFieldName()); @@ -292,6 +268,27 @@ public class Segment { } catch (final InterruptedException e) { ConcurrentLog.logException(e); } + } else if (connectedCitation()) try { + // read the references from the citation index + ReferenceContainer references; + references = urlCitation().get(id, null); + if (references == null) return; // no references at all + Iterator ri = references.entries(); + while (ri.hasNext()) { + CitationReference ref = ri.next(); + byte[] hh = ref.hosthash(); // host hash + if (ByteBuffer.equals(hh, 0, id, 6, 6)) { + internalIDs.put(ref.urlhash()); + internal++; + } else { + externalHosts.put(hh); + externalIDs.put(ref.urlhash()); + external++; + } + } + } catch (SpaceExceededException e) { + // the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now + if (Segment.this.fulltext.useWebgraph()) internalIDs.clear(); } this.externalHosts.optimize(); this.internalIDs.optimize();