if the webgraph is used, then use it also for reference computation to

avoid contradictions with references_i in the collection index.
pull/1/head
Michael Peter Christen 10 years ago
parent 6e1dc444c3
commit f94c91315b

@ -241,31 +241,7 @@ public class Segment {
this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
this.internalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0);
this.externalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0);
try {
if (connectedCitation()) {
// read the references from the citation index
ReferenceContainer<CitationReference> references;
references = urlCitation().get(id, null);
if (references == null) return; // no references at all
Iterator<CitationReference> ri = references.entries();
while (ri.hasNext()) {
CitationReference ref = ri.next();
byte[] hh = ref.hosthash(); // host hash
if (ByteBuffer.equals(hh, 0, id, 6, 6)) {
internalIDs.put(ref.urlhash());
internal++;
} else {
externalHosts.put(hh);
externalIDs.put(ref.urlhash());
external++;
}
}
}
} catch (SpaceExceededException e) {
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
if (Segment.this.fulltext.useWebgraph()) internalIDs.clear();
}
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
if (Segment.this.fulltext.useWebgraph()) {
// reqd the references from the webgraph
SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, Long.MAX_VALUE, 100, 1, WebgraphSchema.source_id_s.getSolrFieldName());
@ -292,6 +268,27 @@ public class Segment {
} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
}
} else if (connectedCitation()) try {
// read the references from the citation index
ReferenceContainer<CitationReference> references;
references = urlCitation().get(id, null);
if (references == null) return; // no references at all
Iterator<CitationReference> ri = references.entries();
while (ri.hasNext()) {
CitationReference ref = ri.next();
byte[] hh = ref.hosthash(); // host hash
if (ByteBuffer.equals(hh, 0, id, 6, 6)) {
internalIDs.put(ref.urlhash());
internal++;
} else {
externalHosts.put(hh);
externalIDs.put(ref.urlhash());
external++;
}
}
} catch (SpaceExceededException e) {
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
if (Segment.this.fulltext.useWebgraph()) internalIDs.clear();
}
this.externalHosts.optimize();
this.internalIDs.optimize();

Loading…
Cancel
Save