Refactored postprocessing

For easier understanding and performances profiling.
pull/71/head
luccioman 9 years ago
parent de663be48b
commit 42f45760ed

@ -48,6 +48,13 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.eclipse.jetty.util.ConcurrentHashSet;
import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
@ -100,13 +107,6 @@ import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams; import net.yacy.search.query.QueryParams;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.eclipse.jetty.util.ConcurrentHashSet;
public class CollectionConfiguration extends SchemaConfiguration implements Serializable { public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@ -1131,212 +1131,38 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
collection.contains(CollectionSchema.cr_host_chance_d) && collection.contains(CollectionSchema.cr_host_chance_d) &&
collection.contains(CollectionSchema.cr_host_norm_i))); collection.contains(CollectionSchema.cr_host_norm_i)));
// create the ranking map // create the ranking map
final Map<String, CRV> rankings = new ConcurrentHashMap<String, CRV>(); final Map<String, CRV> rankings = createRankingMap(segment, rrCache, collectionConnector, collection1hosts,
if (shallComputeCR) try { shallComputeCR);
int concurrency = Math.min(collection1hosts.size(), Runtime.getRuntime().availableProcessors());
postprocessingActivity = "collecting cr for " + collection1hosts.size() + " hosts, concurrency = " + concurrency;
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
int countcheck = 0;
for (String host: collection1hosts.keyList(true)) {
// Patch the citation index for links with canonical tags.
// This shall fulfill the following requirement:
// If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
long patchquerycount = collectionConnector.getCountByQuery("{!cache=false}" + patchquery);
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, Long.MAX_VALUE, 20, 1, true,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
SolrDocument doc_B;
int patchquerycountcheck = 0;
try {
while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// find all documents which link to the canonical doc
DigestURL doc_C_url = new DigestURL((String) doc_B.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()));
byte[] doc_B_id = ASCII.getBytes(((String) doc_B.getFieldValue(CollectionSchema.id.getSolrFieldName())));
// we remove all references to B, because these become references to C
if (segment.connectedCitation()) {
ReferenceContainer<CitationReference> doc_A_ids = segment.urlCitation().remove(doc_B_id);
if (doc_A_ids == null) {
//System.out.println("*** document with canonical but no referrer: " + doc_B.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
continue; // the document has a canonical tag but no referrer?
}
Iterator<CitationReference> doc_A_ids_iterator = doc_A_ids.entries();
// for each of the referrer A of B, set A as a referrer of C
while (doc_A_ids_iterator.hasNext()) {
CitationReference doc_A_citation = doc_A_ids_iterator.next();
segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
}
}
patchquerycountcheck++;
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated canonical collection during postprocessing because of short memory");
break;
}
}
} catch (InterruptedException e) {
ConcurrentLog.logException(e);
} catch (SpaceExceededException e) {
ConcurrentLog.logException(e);
}
if (patchquerycount != patchquerycountcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous patchquery count for host " + host + ": expected=" + patchquerycount + ", counted=" + patchquerycountcheck);
// do the citation rank computation
if (collection1hosts.get(host) <= 0) continue;
// select all documents for each host
CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6);
int convergence_attempts = 0;
while (convergence_attempts++ < 30) {
ConcurrentLog.info("CollectionConfiguration", "convergence step " + convergence_attempts + " for host " + host + " ...");
if (crh.convergenceStep()) break;
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated convergenceStep during postprocessing because of short memory");
break;
}
}
ConcurrentLog.info("CollectionConfiguration", "convergence for host " + host + " after " + convergence_attempts + " steps");
// we have now the cr for all documents of a specific host; we store them for later use
Map<String, CRV> crn = crh.normalize();
//crh.log(crn);
rankings.putAll(crn); // accumulate this here for usage in document update later
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated crn akkumulation during postprocessing because of short memory");
break;
}
countcheck++;
}
if (collection1hosts.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + collection1hosts.size() + ", counted=" + countcheck);
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
collection1hosts = new ClusteredScoreMap<String>(true);
}
// process all documents at the webgraph for the outgoing links of this document // process all documents at the webgraph for the outgoing links of this document
final AtomicInteger allcount = new AtomicInteger(0); final AtomicInteger allcount = new AtomicInteger(0);
if (segment.fulltext().useWebgraph() && shallComputeCR) { if (segment.fulltext().useWebgraph() && shallComputeCR) {
postprocessingActivity = "collecting host facets for webgraph cr calculation"; postprocessWebgraph(segment, webgraph, webgraphquery, rankings, allcount);
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
final Set<String> omitFields = new HashSet<String>();
omitFields.add(WebgraphSchema.process_sxt.getSolrFieldName());
omitFields.add(WebgraphSchema.harvestkey_s.getSolrFieldName());
// collect hosts from index which shall take part in citation computation
ReversibleScoreMap<String> webgraphhosts;
try {
Map<String, ReversibleScoreMap<String>> hostfacet = segment.fulltext().getWebgraphConnector().getFacets(webgraphquery, 10000000, WebgraphSchema.source_host_s.getSolrFieldName());
webgraphhosts = hostfacet.get(WebgraphSchema.source_host_s.getSolrFieldName());
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
webgraphhosts = new ClusteredScoreMap<String>(true);
}
try {
final long start = System.currentTimeMillis();
for (String host: webgraphhosts.keyList(true)) {
if (webgraphhosts.get(host) <= 0) continue;
final String hostfinal = host;
// select all webgraph edges and modify their cr value
postprocessingActivity = "writing cr values to webgraph for host " + host;
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
String patchquery = WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\" AND " + WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
final long count = segment.fulltext().getWebgraphConnector().getCountByQuery("{!cache=false}" + patchquery);
int concurrency = Math.min((int) count, Math.max(1, Runtime.getRuntime().availableProcessors() / 4));
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph, concurrency = " + concurrency);
final BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(
patchquery,
WebgraphSchema.source_chars_i.getSolrFieldName() + " asc",
0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true
// TODO: add field list and do partial updates
);
final AtomicInteger proccount = new AtomicInteger(0);
Thread[] t = new Thread[concurrency];
for (final AtomicInteger i = new AtomicInteger(0); i.get() < t.length; i.incrementAndGet()) {
t[i.get()] = new Thread() {
private String name = "CollectionConfiguration.postprocessing.webgraph-" + i.get();
@Override
public void run() {
Thread.currentThread().setName(name);
SolrDocument doc; String id;
try {
processloop: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
try {
SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields);
Collection<Object> proctags = doc.getFieldValues(WebgraphSchema.process_sxt.getSolrFieldName());
for (Object tag: proctags) try {
// switch over tag types
ProcessType tagtype = ProcessType.valueOf((String) tag);
// set cr values
if (tagtype == ProcessType.CITATION) {
if (segment.fulltext().useWebgraph() && webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) {
id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
CRV crv = rankings.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
}
}
if (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)) {
id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
CRV crv = rankings.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
}
}
}
} catch (IllegalArgumentException e) {
ConcurrentLog.logException(e);
}
// write document back to index
try {
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
//segment.fulltext().getWebgraphConnector().deleteById((String) sid.getFieldValue(WebgraphSchema.id.getSolrFieldName()));
segment.fulltext().getWebgraphConnector().add(sid);
} catch (SolrException e) {
ConcurrentLog.logException(e);
} catch (IOException e) {
ConcurrentLog.logException(e);
}
proccount.incrementAndGet();
allcount.incrementAndGet();
if (proccount.get() % 1000 == 0) {
postprocessingActivity = "writing cr values to webgraph for host " + hostfinal + "postprocessed " + proccount + " from " + count + " documents; " +
(proccount.get() * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " +
((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
}
} catch (Throwable e) {
ConcurrentLog.logException(e);
continue processloop;
}
}
} catch (InterruptedException e) {
ConcurrentLog.warn("CollectionConfiguration", e.getMessage(), e);
}
}
};
t[i.get()].start();
}
for (int i = 0; i < t.length; i++) try {
t[i].join(10000);
if (t[i].isAlive()) t[i].interrupt();
} catch (InterruptedException e) {}
if (count != proccount.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous webgraph document count for host " + host + ": expected=" + count + ", counted=" + proccount);
}
} catch (final IOException e2) {
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
}
} }
// process all documents in collection // process all documents in collection
final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id postprocessDocuments(segment, rrCache, harvestkey, byPartialUpdate, collectionConnector, collection,
collection1query, rankings, allcount);
postprocessingCollection1Count = 0;
postprocessingWebgraphCount = 0;
postprocessingActivity = "postprocessing terminated";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
postprocessingRunning = false;
return allcount.get();
}
private void postprocessDocuments(final Segment segment, final ReferenceReportCache rrCache,
final String harvestkey, final boolean byPartialUpdate, final SolrConnector collectionConnector,
final CollectionConfiguration collection, final String collection1query, final Map<String, CRV> rankings,
final AtomicInteger allcount) {
final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment
final Set<String> omitFields = new HashSet<String>(); final Set<String> omitFields = new HashSet<String>();
omitFields.add(CollectionSchema.process_sxt.getSolrFieldName()); omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName()); omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
final Collection<String> failids = new ArrayList<String>(); final Collection<String> failids = new ConcurrentHashSet<String>();
final AtomicInteger countcheck = new AtomicInteger(0); final AtomicInteger countcheck = new AtomicInteger(0);
final AtomicInteger proccount = new AtomicInteger(); final AtomicInteger proccount = new AtomicInteger();
final AtomicInteger proccount_referencechange = new AtomicInteger(); final AtomicInteger proccount_referencechange = new AtomicInteger();
@ -1400,7 +1226,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
this.allFields()); this.allFields());
final Thread rewriteThread[] = new Thread[concurrency]; final Thread rewriteThread[] = new Thread[concurrency];
for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) { for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
rewriteThread[rewrite_start] = new Thread() { rewriteThread[rewrite_start] = new Thread("CollectionConfiguration.postprocessing.rewriteThread-" + rewrite_start) {
@Override @Override
public void run() { public void run() {
SolrDocument doc; SolrDocument doc;
@ -1516,13 +1342,208 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3); ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
} }
collectionConnector.commit(true); // make changes available directly to prevent that the process repeats again collectionConnector.commit(true); // make changes available directly to prevent that the process repeats again
postprocessingCollection1Count = 0; }
postprocessingWebgraphCount = 0;
postprocessingActivity = "postprocessing terminated"; private void postprocessWebgraph(final Segment segment, final WebgraphConfiguration webgraph, String webgraphquery,
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity); final Map<String, CRV> rankings, final AtomicInteger allcount) {
postprocessingRunning = false; postprocessingActivity = "collecting host facets for webgraph cr calculation";
return allcount.get(); ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
} final Set<String> omitFields = new HashSet<String>();
omitFields.add(WebgraphSchema.process_sxt.getSolrFieldName());
omitFields.add(WebgraphSchema.harvestkey_s.getSolrFieldName());
// collect hosts from index which shall take part in citation computation
ReversibleScoreMap<String> webgraphhosts;
try {
Map<String, ReversibleScoreMap<String>> hostfacet = segment.fulltext().getWebgraphConnector().getFacets(webgraphquery, 10000000, WebgraphSchema.source_host_s.getSolrFieldName());
webgraphhosts = hostfacet.get(WebgraphSchema.source_host_s.getSolrFieldName());
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
webgraphhosts = new ClusteredScoreMap<String>(true);
}
try {
final long start = System.currentTimeMillis();
for (String host: webgraphhosts.keyList(true)) {
if (webgraphhosts.get(host) <= 0) continue;
final String hostfinal = host;
// select all webgraph edges and modify their cr value
postprocessingActivity = "writing cr values to webgraph for host " + host;
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
String patchquery = WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\" AND " + WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
final long count = segment.fulltext().getWebgraphConnector().getCountByQuery("{!cache=false}" + patchquery);
int concurrency = Math.min((int) count, Math.max(1, Runtime.getRuntime().availableProcessors() / 4));
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph, concurrency = " + concurrency);
final BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(
patchquery,
WebgraphSchema.source_chars_i.getSolrFieldName() + " asc",
0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true
// TODO: add field list and do partial updates
);
final AtomicInteger proccount = new AtomicInteger(0);
Thread[] t = new Thread[concurrency];
for (int i = 0; i < t.length; i++) {
t[i] = new Thread("CollectionConfiguration.postprocessing.webgraph-" + i) {
@Override
public void run() {
SolrDocument doc; String id;
try {
processloop: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
try {
SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields);
Collection<Object> proctags = doc.getFieldValues(WebgraphSchema.process_sxt.getSolrFieldName());
for (Object tag: proctags) try {
// switch over tag types
ProcessType tagtype = ProcessType.valueOf((String) tag);
// set cr values
if (tagtype == ProcessType.CITATION) {
if (segment.fulltext().useWebgraph() && webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) {
id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
CRV crv = rankings.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
}
}
if (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)) {
id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
CRV crv = rankings.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
}
}
}
} catch (IllegalArgumentException e) {
ConcurrentLog.logException(e);
}
// write document back to index
try {
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
//segment.fulltext().getWebgraphConnector().deleteById((String) sid.getFieldValue(WebgraphSchema.id.getSolrFieldName()));
segment.fulltext().getWebgraphConnector().add(sid);
} catch (SolrException e) {
ConcurrentLog.logException(e);
} catch (IOException e) {
ConcurrentLog.logException(e);
}
proccount.incrementAndGet();
allcount.incrementAndGet();
if (proccount.get() % 1000 == 0) {
postprocessingActivity = "writing cr values to webgraph for host " + hostfinal + "postprocessed " + proccount + " from " + count + " documents; " +
(proccount.get() * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " +
((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
}
} catch (Throwable e) {
ConcurrentLog.logException(e);
continue processloop;
}
}
} catch (InterruptedException e) {
ConcurrentLog.warn("CollectionConfiguration", e.getMessage(), e);
}
}
};
t[i].start();
}
for (int i = 0; i < t.length; i++) try {
t[i].join(10000);
if (t[i].isAlive()) t[i].interrupt();
} catch (InterruptedException e) {}
if (count != proccount.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous webgraph document count for host " + host + ": expected=" + count + ", counted=" + proccount);
}
} catch (final IOException e2) {
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
}
}
private Map<String, CRV> createRankingMap(final Segment segment, final ReferenceReportCache rrCache,
final SolrConnector collectionConnector, ReversibleScoreMap<String> collection1hosts,
boolean shallComputeCR) {
final Map<String, CRV> rankings = new ConcurrentHashMap<String, CRV>();
if (shallComputeCR) try {
int concurrency = Math.min(collection1hosts.size(), Runtime.getRuntime().availableProcessors());
postprocessingActivity = "collecting cr for " + collection1hosts.size() + " hosts, concurrency = " + concurrency;
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
int countcheck = 0;
for (String host: collection1hosts.keyList(true)) {
// Patch the citation index for links with canonical tags.
// This shall fulfill the following requirement:
// If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
long patchquerycount = collectionConnector.getCountByQuery("{!cache=false}" + patchquery);
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, Long.MAX_VALUE, 20, 1, true,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
SolrDocument doc_B;
int patchquerycountcheck = 0;
try {
while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// find all documents which link to the canonical doc
DigestURL doc_C_url = new DigestURL((String) doc_B.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()));
byte[] doc_B_id = ASCII.getBytes(((String) doc_B.getFieldValue(CollectionSchema.id.getSolrFieldName())));
// we remove all references to B, because these become references to C
if (segment.connectedCitation()) {
ReferenceContainer<CitationReference> doc_A_ids = segment.urlCitation().remove(doc_B_id);
if (doc_A_ids == null) {
//System.out.println("*** document with canonical but no referrer: " + doc_B.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
continue; // the document has a canonical tag but no referrer?
}
Iterator<CitationReference> doc_A_ids_iterator = doc_A_ids.entries();
// for each of the referrer A of B, set A as a referrer of C
while (doc_A_ids_iterator.hasNext()) {
CitationReference doc_A_citation = doc_A_ids_iterator.next();
segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
}
}
patchquerycountcheck++;
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated canonical collection during postprocessing because of short memory");
break;
}
}
} catch (InterruptedException e) {
ConcurrentLog.logException(e);
} catch (SpaceExceededException e) {
ConcurrentLog.logException(e);
}
if (patchquerycount != patchquerycountcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous patchquery count for host " + host + ": expected=" + patchquerycount + ", counted=" + patchquerycountcheck);
// do the citation rank computation
if (collection1hosts.get(host) <= 0) continue;
// select all documents for each host
CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6);
int convergence_attempts = 0;
while (convergence_attempts++ < 30) {
ConcurrentLog.info("CollectionConfiguration", "convergence step " + convergence_attempts + " for host " + host + " ...");
if (crh.convergenceStep()) break;
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated convergenceStep during postprocessing because of short memory");
break;
}
}
ConcurrentLog.info("CollectionConfiguration", "convergence for host " + host + " after " + convergence_attempts + " steps");
// we have now the cr for all documents of a specific host; we store them for later use
Map<String, CRV> crn = crh.normalize();
//crh.log(crn);
rankings.putAll(crn); // accumulate this here for usage in document update later
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated crn akkumulation during postprocessing because of short memory");
break;
}
countcheck++;
}
if (collection1hosts.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + collection1hosts.size() + ", counted=" + countcheck);
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
collection1hosts = new ClusteredScoreMap<String>(true);
}
return rankings;
}
public void postprocessing_http_unique(final Segment segment, final SolrDocument doc, final SolrInputDocument sid, final DigestURL url) { public void postprocessing_http_unique(final Segment segment, final SolrDocument doc, final SolrInputDocument sid, final DigestURL url) {
if (!this.contains(CollectionSchema.http_unique_b)) return; if (!this.contains(CollectionSchema.http_unique_b)) return;

Loading…
Cancel
Save