Added a citation reference computation for intra-domain link structures.

While the values for the reference evaluation are computed, also a
backlink-structure can be discovered and written to the index as well.
The host browser has been extended to show such backlinks for each
presented link. The host browser therefore can now show information
about where a document is linked. The new citation reference is computed
as the likelihood of a random click path with recursive usage of previously
computed likelihoods. This process is repeated until the likelihood
converges to a specific number. This number is then normalized to a
ranking value CRn, 0<=CRn<=1. The value CRn can therefore be used to
rank popularity within intra-domain link structures.
pull/1/head
Michael Peter Christen 12 years ago
parent e20450e798
commit f7e77a21bf

@ -69,9 +69,15 @@ httpstatus_i
## number of unique http references, should be equal to references_internal_i + references_external_i
references_i
## number of unique http references from same host as referenced url
## number of unique http references from same host to referenced url
references_internal_i
## ids of unique http references from same host to referenced url
#references_internal_id_sxt
## urls of unique http references from same host to referenced url
#references_internal_url_sxt
## number of unique http references from external hosts
references_external_i
@ -93,8 +99,8 @@ load_date_dt
## date until resource shall be considered as fresh
fresh_date_dt
## ids of referrer to this document
referrer_id_txt
## id of the referrer to this document, discovered during crawling
referrer_id_s
## the name of the publisher of the document
publisher_t
@ -396,6 +402,19 @@ host_extent_i
#opengraph_url_s
#opengraph_image_s
## citation ranking
## the number of documents within a single host
#cr_host_count_i
## the chance to click on this page when randomly clicking on links within one host
#cr_host_chance_d
## normalization of chance: 0 for the lower half of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10
#cr_host_norm_i
## names of cms attributes; if several are recognized then they are listed in decreasing order of the number of matching criteria
#ext_cms_txt

@ -21,6 +21,7 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@ -273,8 +274,12 @@ public class HostBrowser {
CollectionSchema.clickdepth_i.getSolrFieldName(),
CollectionSchema.references_i.getSolrFieldName(),
CollectionSchema.references_internal_i.getSolrFieldName(),
CollectionSchema.references_internal_id_sxt.getSolrFieldName(),
CollectionSchema.references_internal_url_sxt.getSolrFieldName(),
CollectionSchema.references_external_i.getSolrFieldName(),
CollectionSchema.references_exthosts_i.getSolrFieldName()
CollectionSchema.references_exthosts_i.getSolrFieldName(),
CollectionSchema.cr_host_chance_d.getSolrFieldName(),
CollectionSchema.cr_host_norm_i.getSolrFieldName()
);
SolrDocument doc;
Set<String> storedDocs = new HashSet<String>();
@ -290,7 +295,7 @@ public class HostBrowser {
String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
infoCache.put(ids, new InfoCacheEntry(doc));
infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), doc));
if (u.startsWith(path)) {
if (delete) {
deleteIDs.add(ids);
@ -425,9 +430,7 @@ public class HostBrowser {
if (type == StoreType.INDEX) {
String ids = ASCII.String(uri.hash());
InfoCacheEntry ice = infoCache.get(ids);
prop.put("files_list_" + c + "_type_stored_comment",
(ice.references >= 0 ? "refs: " + ice.references_internal + " int, " + ice.references_external + " ext, " + ice.references_exthosts + " hosts" : "") +
(ice.references >= 0 && ice.clickdepth >= 0 ? ", " : "") + (ice.clickdepth >= 0 ? "clickdepth: " + ice.clickdepth : ""));
prop.put("files_list_" + c + "_type_stored_comment", ice.toString()); // ice.toString() contains html, therefore do not use putHTML here
}
prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
if (error) {
@ -505,19 +508,52 @@ public class HostBrowser {
}
public static final class InfoCacheEntry {
public Integer cr_n;
public Double cr_c;
public int clickdepth, references, references_internal, references_external, references_exthosts;
public InfoCacheEntry(final SolrDocument doc) {
public List<String> references_internal_urls;
private final Fulltext fulltext;
public InfoCacheEntry(final Fulltext fulltext, final SolrDocument doc) {
this.fulltext = fulltext;
this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
Collection<Object> rc_internal_id = doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName());
Collection<Object> rc_internal_url = doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName());
Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue();
this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
// calculate the url reference list
this.references_internal_urls = new ArrayList<String>();
if (rc_internal_url != null) {
for (Object o: rc_internal_url) references_internal_urls.add((String) o);
} else if (rc_internal_id != null) {
for (Object o: rc_internal_id) {
DigestURI u = fulltext.getURL(ASCII.getBytes((String) o));
if (u != null) references_internal_urls.add(u.toNormalform(true));
}
}
this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue();
this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue();
}
public String toString() {
StringBuilder sb = new StringBuilder();
for (String s: references_internal_urls) sb.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
if (sb.length() == 0 && !fulltext.getDefaultConfiguration().contains(CollectionSchema.references_internal_id_sxt)) sb.append("<a href='/IndexSchema_p.html'><img src='env/grafics/i16.gif' alt='info' title='activate references_internal_id_sxt in IndexSchema_p.html to see all backlinks' width='12' height='12'/></a>");
return
(this.clickdepth >= 0 ?
"clickdepth: " + this.clickdepth :
"") +
(this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
(this.cr_n != null ? ", crn=" + this.cr_n : "") +
(this.references >= 0 ?
", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + (sb.length() > 0 ? " " + sb.toString() + "" : "") :
"");
}
}
}

@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr;
*/
public enum ProcessType {
CLICKDEPTH;
CLICKDEPTH, CITATION;
}

@ -23,6 +23,8 @@ package net.yacy.cora.federate.solr;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
@ -32,16 +34,14 @@ import org.apache.log4j.Logger;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.schema.CollectionSchema;
public class SchemaConfiguration extends Configuration implements Serializable {
@ -94,56 +94,72 @@ public class SchemaConfiguration extends Configuration implements Serializable {
return false;
}
public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map<String, Long> hostExtentCount) {
if (!(this.contains(CollectionSchema.references_i) || this.contains(CollectionSchema.references_internal_i) ||
public boolean postprocessing_references(Fulltext fulltext, ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map<String, Long> hostExtentCount) {
if (!(this.contains(CollectionSchema.references_i) ||
this.contains(CollectionSchema.references_internal_i) ||
this.contains(CollectionSchema.references_internal_id_sxt) || this.contains(CollectionSchema.references_internal_url_sxt) ||
this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false;
Integer all_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
Integer internal_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
Collection<Object> internal_ids_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName());
Collection<Object> internal_urls_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName());
Integer external_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
Integer exthosts_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
Integer hostextc_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName());
ReferenceContainer<CitationReference> references;
try {
int all = 0, internal = 0, external = 0;
references = segment.urlCitation().get(url.hash(), null);
if (references == null) return false; // no references at all
//int references = segment.urlCitation().count(url.hash());
byte[] uh0 = url.hash();
Iterator<CitationReference> ri = references.entries();
HandleSet exthosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
while (ri.hasNext()) {
CitationReference ref = ri.next();
byte[] hh = ref.hosthash();
exthosts.put(hh);
all++;
if (ByteBuffer.equals(hh, 0, uh0, 6, 6)) internal++; else external++;
ReferenceReport rr = rrCache.getReferenceReport(url.hash(), false);
List<String> internalIDs = new ArrayList<String>();
HandleSet iids = rr.getInternallIDs();
for (byte[] b: iids) internalIDs.add(ASCII.String(b));
List<String> internalURLs = new ArrayList<String>();
if (this.contains(CollectionSchema.references_internal_url_sxt)) {
// get all urls from the index and store them here
for (String id: internalIDs) {
DigestURI u = fulltext.getURL(ASCII.getBytes(id));
if (u != null) internalURLs.add(u.toNormalform(true));
}
}
boolean change = false;
if (all_old == null || all_old.intValue() != all) {
int all = rr.getExternalCount() + rr.getInternalCount();
if (this.contains(CollectionSchema.references_i) &&
(all_old == null || all_old.intValue() != all)) {
sid.setField(CollectionSchema.references_i.getSolrFieldName(), all);
change = true;
}
if (internal_old == null || internal_old.intValue() != internal) {
sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), internal);
if (this.contains(CollectionSchema.references_internal_i) &&
(internal_old == null || internal_old.intValue() != rr.getInternalCount())) {
sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), rr.getInternalCount());
change = true;
}
if (external_old == null || external_old.intValue() != external) {
sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), external);
if (this.contains(CollectionSchema.references_internal_id_sxt) &&
(internal_ids_old == null || internal_ids_old.size() != internalIDs.size())) {
sid.setField(CollectionSchema.references_internal_id_sxt.getSolrFieldName(), internalIDs);
change = true;
}
if (exthosts_old == null || exthosts_old.intValue() != exthosts.size()) {
sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), exthosts.size());
if (this.contains(CollectionSchema.references_internal_url_sxt) &&
(internal_urls_old == null || internal_urls_old.size() != internalURLs.size())) {
sid.setField(CollectionSchema.references_internal_url_sxt.getSolrFieldName(), internalURLs);
change = true;
}
if (this.contains(CollectionSchema.references_external_i) &&
(external_old == null || external_old.intValue() != rr.getExternalCount())) {
sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), rr.getExternalCount());
change = true;
}
if (this.contains(CollectionSchema.references_exthosts_i) &&
(exthosts_old == null || exthosts_old.intValue() != rr.getExternalHostIDs().size())) {
sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), rr.getExternalHostIDs().size());
change = true;
}
Long hostExtent = hostExtentCount == null ? Integer.MAX_VALUE : hostExtentCount.get(url.hosthash());
if (hostextc_old == null || hostextc_old.intValue() != hostExtent) {
if (this.contains(CollectionSchema.host_extent_i) &&
(hostextc_old == null || hostextc_old.intValue() != hostExtent)) {
sid.setField(CollectionSchema.host_extent_i.getSolrFieldName(), hostExtent.intValue());
change = true;
}
return change;
} catch (IOException e) {
} catch (SpaceExceededException e) {
}
return false;
}

@ -321,7 +321,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
List<Count> values = facet.getValues();
if (values == null) continue;
for (Count ff: values) result.set(ff.getName(), (int) ff.getCount());
for (Count ff: values) if (ff.getCount() > 0) result.set(ff.getName(), (int) ff.getCount());
facets.put(field, result);
}
return facets;

@ -186,9 +186,9 @@ public class URIMetadataNode {
}
public byte[] referrerHash() {
ArrayList<String> referrer = getStringList(CollectionSchema.referrer_id_txt);
if (referrer == null || referrer.size() == 0) return null;
return ASCII.getBytes(referrer.get(0));
String referrer = getString(CollectionSchema.referrer_id_s);
if (referrer == null || referrer.length() == 0) return null;
return ASCII.getBytes(referrer);
}
public int size() {

@ -234,7 +234,10 @@ public class WorkflowProcessor<J extends WorkflowJob> {
// wait for shutdown
try {
this.executor.shutdown();
this.executor.awaitTermination(60, TimeUnit.SECONDS);
for (int i = 0; i < 60; i++) {
this.executor.awaitTermination(1, TimeUnit.SECONDS);
if (this.input.size() <= 0) break;
}
} catch (final InterruptedException e) {}
}
Log.logInfo("serverProcessor", "queue " + this.processName + ": shutdown.");

@ -35,6 +35,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import org.apache.solr.common.SolrDocument;
@ -45,6 +46,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.ByteOrder;
@ -82,6 +84,7 @@ import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.schema.WebgraphSchema;
public class Segment {
@ -278,6 +281,108 @@ public class Segment {
return 999;
}
public ReferenceReportCache getReferenceReportCache() {
return new ReferenceReportCache();
}
/**
 * Cache for ReferenceReport objects, keyed by the 12-byte url hash of a document.
 * Reports are computed lazily on first request and then reused, which avoids
 * repeated webgraph/citation-index lookups during a postprocessing run.
 */
public class ReferenceReportCache {
    // url hash -> report; TreeMap with Base64Order so byte[] keys compare by content
    Map<byte[], ReferenceReport> cache;
    public ReferenceReportCache() {
        this.cache = new TreeMap<byte[], ReferenceReport>(Base64Order.enhancedCoder);
    }
    /**
     * Get (or lazily compute and cache) the reference report for a document.
     * @param id the 12-byte url hash of the document
     * @param acceptSelfReference true if a document linking to itself shall be counted
     * @return the reference report for the given id
     * @throws IOException if index access fails or the report computation runs out of space
     */
    public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException {
        ReferenceReport rr = cache.get(id);
        if (rr != null) return rr;
        try {
            rr = new ReferenceReport(id, acceptSelfReference);
            cache.put(id, rr);
            return rr;
        } catch (SpaceExceededException e) {
            Log.logException(e);
            // chain the original exception so the root cause is not lost
            throw new IOException(e.getMessage(), e);
        }
    }
}
/**
* A ReferenceReport object is a container for all references to a specific url.
* The class stores the number of links from domain-internal and domain-external backlinks,
* and the host hashes of all externally linking documents,
* all IDs from external hosts and all IDs from the same domain.
*/
public final class ReferenceReport {
    // backlink counters: documents from the same host (internal) vs. other hosts (external)
    private int internal, external;
    // externalHosts: 6-byte host hashes of external referrers;
    // externalIDs/internalIDs: 12-byte url hashes of the referring documents
    private HandleSet externalHosts, externalIDs, internalIDs;
    /**
     * Collects all backlinks to the document with the given id, either from the
     * webgraph index (if webgraph writing is enabled) or from the citation index.
     * @param id the 12-byte url hash of the target document
     * @param acceptSelfReference true if a document linking to itself shall be counted
     * @throws IOException on index access failure
     * @throws SpaceExceededException if the handle sets cannot grow
     */
    public ReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException, SpaceExceededException {
        this.internal = 0;
        this.external = 0;
        this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
        this.internalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0);
        this.externalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0);
        if (writeToWebgraph()) {
            // read the references from the webgraph
            SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
            webgraph.commit(true); // make sure the latest writes are searchable before querying
            BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery(WebgraphSchema.target_id_s.getSolrFieldName() + ":\"" + ASCII.String(id) + "\"", 0, 10000000, 600000, 100, WebgraphSchema.source_id_s.getSolrFieldName());
            SolrDocument doc;
            try {
                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                    String refid = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
                    if (refid == null) continue;
                    byte[] refidh = ASCII.getBytes(refid);
                    byte[] hh = new byte[6]; // host hash
                    // the 6-byte host hash is embedded at offset 6 of the 12-byte url hash
                    System.arraycopy(refidh, 6, hh, 0, 6);
                    if (ByteBuffer.equals(hh, 0, id, 6, 6)) {
                        // same host: count as internal, skipping self-references unless accepted
                        if (acceptSelfReference || !ByteBuffer.equals(refidh, id)) {
                            internalIDs.put(refidh);
                            internal++;
                        }
                    } else {
                        externalHosts.put(hh);
                        externalIDs.put(refidh);
                        external++;
                    }
                }
            } catch (InterruptedException e) {
                // NOTE(review): the interrupt status is not restored here
                // (Thread.currentThread().interrupt()) — confirm this is intended
                Log.logException(e);
            }
        } else {
            // read the references from the citation index
            ReferenceContainer<CitationReference> references;
            references = urlCitation().get(id, null);
            if (references == null) return; // no references at all
            Iterator<CitationReference> ri = references.entries();
            while (ri.hasNext()) {
                CitationReference ref = ri.next();
                byte[] hh = ref.hosthash(); // host hash
                if (ByteBuffer.equals(hh, 0, id, 6, 6)) {
                    // NOTE(review): unlike the webgraph branch, self-references are not
                    // filtered by acceptSelfReference here — confirm the asymmetry is intended
                    internalIDs.put(ref.urlhash());
                    internal++;
                } else {
                    externalHosts.put(hh);
                    externalIDs.put(ref.urlhash());
                    external++;
                }
            }
        }
    }
    /** @return number of backlinks from the same host */
    public int getInternalCount() {
        return this.internal;
    }
    /** @return number of backlinks from other hosts */
    public int getExternalCount() {
        return this.external;
    }
    /** @return 6-byte host hashes of all externally linking hosts */
    public HandleSet getExternalHostIDs() {
        return this.externalHosts;
    }
    /** @return 12-byte url hashes of all externally linking documents */
    public HandleSet getExternalIDs() {
        return this.externalIDs;
    }
    /** @return 12-byte url hashes of all same-host linking documents (method name misspelling kept for caller compatibility) */
    public HandleSet getInternallIDs() {
        return this.internalIDs;
    }
}
public long RWICount() {
if (this.termIndex == null) return 0;
return this.termIndex.sizesMax();
@ -598,7 +703,7 @@ public class Segment {
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {
this.fulltext.getDefaultConfiguration().postprocessing_references(this, null, vector, url, null);
this.fulltext.getDefaultConfiguration().postprocessing_references(this.fulltext, this.getReferenceReportCache(), null, vector, url, null);
}
// STORE TO SOLR
String error = null;

@ -39,6 +39,7 @@ import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.ASCII;
@ -52,10 +53,15 @@ import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -64,10 +70,13 @@ import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.RowHandleMap;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@ -87,7 +96,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param configurationFile
* @throws IOException
*/
public CollectionConfiguration(final File configurationFile, boolean lazy) throws IOException {
public CollectionConfiguration(final File configurationFile, final boolean lazy) throws IOException {
super(configurationFile);
super.lazy = lazy;
this.rankings = new ArrayList<Ranking>(4);
@ -115,11 +124,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
public Ranking getRanking(int idx) {
public Ranking getRanking(final int idx) {
return this.rankings.get(idx);
}
public Ranking getRanking(String name) {
public Ranking getRanking(final String name) {
if (name == null) return null;
for (int i = 0; i < this.rankings.size(); i++) {
Ranking r = this.rankings.get(i);
@ -163,7 +172,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param doc the solr document
* @return a solr input document
*/
public SolrInputDocument toSolrInputDocument(SolrDocument doc) {
public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema
@ -173,7 +182,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return sid;
}
public SolrDocument toSolrDocument(SolrInputDocument doc) {
public SolrDocument toSolrDocument(final SolrInputDocument doc) {
SolrDocument sd = new SolrDocument();
for (SolrInputField field: doc) {
if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema
@ -280,7 +289,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate());
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate());
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash());
if ((allAttr || contains(CollectionSchema.referrer_id_txt)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_txt, new String[]{ASCII.String(md.referrerHash())});
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash()));
if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5());
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
if ((allAttr || contains(CollectionSchema.language_s)) && md.language() != null) add(doc, CollectionSchema.language_s, UTF8.String(md.language()));
@ -328,9 +337,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
public SolrVector yacy2solr(
final String id, final String[] collections, final ResponseHeader responseHeader,
final Document document, Condenser condenser, DigestURI referrerURL, String language,
IndexCell<CitationReference> citations,
WebgraphConfiguration webgraph) {
final Document document, final Condenser condenser, final DigestURI referrerURL, final String language,
final IndexCell<CitationReference> citations,
final WebgraphConfiguration webgraph) {
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
final DigestURI digestURI = document.dc_source();
@ -356,6 +365,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
}
if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
if (allAttr || contains(CollectionSchema.ip_s)) {
final InetAddress address = digestURI.getInetAddress();
if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
@ -778,7 +791,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash());
if ((allAttr || contains(CollectionSchema.referrer_id_txt)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_txt, new String[]{ASCII.String(referrerURL.hash())});
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
if ((allAttr || contains(CollectionSchema.language_s)) && language != null) add(doc, CollectionSchema.language_s, language);
@ -812,60 +825,264 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param urlCitation
* @return
*/
public void postprocessing(Segment segment) {
public void postprocessing(final Segment segment) {
if (!this.contains(CollectionSchema.process_sxt)) return;
if (!segment.connectedCitation()) return;
SolrConnector connector = segment.fulltext().getDefaultConnector();
// that means we must search for those entries.
connector.commit(true); // make sure that we have latest information that can be found
//BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
ReferenceReportCache rrCache = segment.getReferenceReportCache();
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
try {
// collect hosts from index which shall take part in citation computation
ReversibleScoreMap<String> hostscore = connector.getFacets(CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(), 10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
// for each host, do a citation rank computation
for (String host: hostscore.keyList(true)) {
if (hostscore.get(host) <= 0) continue;
// select all documents for each host
CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6);
int convergence_attempts = 0;
while (convergence_attempts++ < 30) {
if (crh.convergenceStep()) break;
}
Log.logInfo("CollectionConfiguration.CRHost", "convergence for host " + host + " after " + convergence_attempts + " steps");
// we have now the cr for all documents of a specific host; we store them for later use
Map<byte[], CRV> crn = crh.normalize();
crh.log(crn);
ranking.putAll(crn); // accumulate this here for usage in document update later
}
} catch (IOException e2) {
}
// process all documents
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
SolrDocument doc;
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>();
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// for each to-be-processed entry work on the process tag
Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
for (Object tag: proctags) {
try {
DigestURI url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName())));
byte[] id = url.hash();
SolrInputDocument sid = this.toSolrInputDocument(doc);
try {
DigestURI url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName())));
SolrInputDocument sid = this.toSolrInputDocument(doc);
for (Object tag: proctags) {
// switch over tag types
ProcessType tagtype = ProcessType.valueOf((String) tag);
if (tagtype == ProcessType.CLICKDEPTH) {
if (postprocessing_clickdepth(segment, doc, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++;
}
// refresh the link count; it's 'cheap' to do this here
String hosthash = url.hosthash();
if (!hostExtentCache.containsKey(hosthash)) {
StringBuilder q = new StringBuilder();
q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
hostExtentCache.put(hosthash, count);
if (tagtype == ProcessType.CITATION) {
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr);
sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn);
proccount_citationchange++;
}
}
if (postprocessing_references(segment, doc, sid, url, hostExtentCache)) proccount_referencechange++;
// all processing steps checked, remove the processing tag
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
// send back to index
connector.add(sid);
proccount++;
} catch (Throwable e1) {
}
// refresh the link count; it's 'cheap' to do this here
String hosthash = url.hosthash();
if (!hostExtentCache.containsKey(hosthash)) {
StringBuilder q = new StringBuilder();
q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
hostExtentCache.put(hosthash, count);
}
if (postprocessing_references(segment.fulltext(), rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++;
// all processing steps checked, remove the processing tag
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
// send back to index
//connector.deleteById(ASCII.String(id));
connector.add(sid);
proccount++;
} catch (Throwable e1) {
}
}
Log.logInfo("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + proccount_clickdepthchange + " clickdepth values changed, " + proccount_referencechange + " reference-count values changed.");
Log.logInfo("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " +
proccount_clickdepthchange + " clickdepth changes, " +
proccount_referencechange + " reference-count changes," +
proccount_citationchange + " citation ranking changes.");
} catch (InterruptedException e) {
}
}
private static final class CRV {
public double cr;
public int crn, count;
public CRV(final int count, final double cr, final int crn) {this.count = count; this.cr = cr; this.crn = crn;}
public String toString() {
return "count=" + count + ", cr=" + cr + ", crn=" + crn;
}
}
/**
* The CRHost class is a container for all ranking values of a specific host.
* Objects of that class are needed as an environment for repeated convergenceStep() computations,
* which are iterative citation rank computations that are repeated until the ranking values
* converge to stable values.
* The class also contains normalization methods to compute simple integer ranking values out of the
* double relevance values.
*/
private static final class CRHost {
private final Segment segment;
private final Map<byte[], double[]> crt;
private final int cr_host_count;
private final RowHandleMap internal_links_counter;
private double damping;
private int converge_eq_factor;
private ReferenceReportCache rrCache;
public CRHost(final Segment segment, final ReferenceReportCache rrCache, final String host, final double damping, final int converge_digits) {
this.segment = segment;
this.damping = damping;
this.rrCache = rrCache;
this.converge_eq_factor = (int) Math.pow(10.0d, converge_digits);
SolrConnector connector = segment.fulltext().getDefaultConnector();
this.crt = new TreeMap<byte[], double[]>(Base64Order.enhancedCoder);
try {
// select all documents for each host
BlockingQueue<String> ids = connector.concurrentIDsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"", 0, 1000000, 600000);
String id;
while ((id = ids.take()) != AbstractSolrConnector.POISON_ID) {
crt.put(ASCII.getBytes(id), new double[]{0.0d,0.0d}); //{old value, new value}
}
} catch (InterruptedException e2) {
}
this.cr_host_count = crt.size();
double initval = 1.0d / cr_host_count;
for (Map.Entry<byte[], double[]> entry: this.crt.entrySet()) entry.getValue()[0] = initval;
this.internal_links_counter = new RowHandleMap(12, Base64Order.enhancedCoder, 8, 100, "internal_links_counter");
}
/**
* produce a map from IDs to CRV records, normalization entries containing the values that are stored to solr.
* @return
*/
public Map<byte[], CRV> normalize() {
TreeMap<Double, List<byte[]>> reorder = new TreeMap<Double, List<byte[]>>();
for (Map.Entry<byte[], double[]> entry: crt.entrySet()) {
Double d = entry.getValue()[0];
List<byte[]> ds = reorder.get(d);
if (ds == null) {ds = new ArrayList<byte[]>(); reorder.put(d, ds);}
ds.add(entry.getKey());
}
int nextcount = (this.cr_host_count + 1) / 2;
int nextcrn = 0;
Map<byte[], CRV> r = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
while (reorder.size() > 0) {
int count = nextcount;
while (reorder.size() > 0 && count > 0) {
Map.Entry<Double, List<byte[]>> next = reorder.pollFirstEntry();
List<byte[]> ids = next.getValue();
count -= ids.size();
double cr = next.getKey();
for (byte[] id: ids) r.put(id, new CRV(this.cr_host_count, cr, nextcrn));
}
nextcrn++;
nextcount = Math.max(1, (nextcount + count + 1) / 2);
}
// finally, increase the crn number in such a way that the maximum is always 10
int inc = 11 - nextcrn; // nextcrn is +1
for (Map.Entry<byte[], CRV> entry: r.entrySet()) entry.getValue().crn += inc;
return r;
}
/**
* log out a complete CRHost set of urls and ranking values
* @param rm
*/
public void log(final Map<byte[], CRV> rm) {
// print out all urls with their cr-values
SolrConnector connector = segment.fulltext().getDefaultConnector();
for (Map.Entry<byte[], CRV> entry: rm.entrySet()) {
try {
String url = (String) connector.getDocumentById(ASCII.String(entry.getKey()), CollectionSchema.sku.getSolrFieldName()).getFieldValue(CollectionSchema.sku.getSolrFieldName());
Log.logInfo("CollectionConfiguration.CRHost", "CR for " + url);
Log.logInfo("CollectionConfiguration.CRHost", ">> " + entry.getValue().toString());
} catch (IOException e) {
Log.logException(e);
}
}
}
/**
* Calculate the number of internal links from a specific document, denoted by the document ID.
* This is a very important attribute for the ranking computation because it is the dividend for the previous ranking attribute.
* The internalLinks value will be requested several times for the same id during the convergenceStep()-steps; therefore it should use a cache.
* This cache is part of the CRHost data structure.
* @param id
* @return the number of links from the document, denoted by the ID to documents within the same domain
*/
public int getInternalLinks(final byte[] id) {
int il = (int) this.internal_links_counter.get(id);
if (il >= 0) return il;
try {
SolrDocument doc = this.segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(id), CollectionSchema.inboundlinkscount_i.getSolrFieldName());
if (doc == null) {
this.internal_links_counter.put(id, 0);
return 0;
}
Object x = doc.getFieldValue(CollectionSchema.inboundlinkscount_i.getSolrFieldName());
il = (x == null) ? 0 : (x instanceof Integer) ? ((Integer) x).intValue() : (x instanceof Long) ? ((Long) x).intValue() : 0;
this.internal_links_counter.put(id, il);
return il;
} catch (IOException e) {
Log.logException(e);
} catch (SpaceExceededException e) {
Log.logException(e);
}
try {this.internal_links_counter.put(id, 0);} catch (SpaceExceededException e) {}
return 0;
}
/**
* Use the crt cache to compute the next generation of crt values.
* @return
*/
public boolean convergenceStep() {
boolean convergence = true;
double df = (1.0d - damping) / this.cr_host_count;
try {
for (Map.Entry<byte[], double[]> entry: crt.entrySet()) {
byte[] id = entry.getKey();
ReferenceReport rr = this.rrCache.getReferenceReport(id, false);
// sum up the cr of the internal links
HandleSet iids = rr.getInternallIDs();
double ncr = 0.0d;
for (byte[] iid: iids) {
int ilc = getInternalLinks(iid);
if (ilc > 0) { // if (ilc == 0) then the reference report is wrong!
ncr += this.crt.get(iid)[0] / ilc;
}
}
ncr = df + damping * ncr;
if (convergence && !eqd(ncr, entry.getValue()[0])) convergence = false;
entry.getValue()[1] = ncr;
}
// after the loop, replace the old value with the new value in crt
for (Map.Entry<byte[], double[]> entry: crt.entrySet()) {
entry.getValue()[0] = entry.getValue()[1];
}
} catch (IOException e) {
}
return convergence;
}
/**
* helper method to check if two doubles are equal using a specific number of digits
* @param a
* @param b
* @return
*/
private boolean eqd(final double a, final double b) {
return ((int) (a * this.converge_eq_factor)) == ((int) (b * this.converge_eq_factor));
}
}
/**
* this method compresses a list of protocol names to an indexed list.
@ -876,7 +1093,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param protocol
* @return a list of indexed protocol entries
*/
private static List<String> protocolList2indexedList(List<String> protocol) {
private static List<String> protocolList2indexedList(final List<String> protocol) {
List<String> a = new ArrayList<String>();
String p;
for (int i = 0; i < protocol.size(); i++) {

@ -51,7 +51,9 @@ public enum CollectionSchema implements SchemaDeclaration {
httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"),
references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host as referenced url"),
references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"),
references_internal_id_sxt(SolrType.string, true, true, true, false, true, "ids of unique http references from same host to referenced url"),
references_internal_url_sxt(SolrType.string, true, true, true, false, true, "urls of unique http references from same host to referenced url"),
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
@ -60,7 +62,7 @@ public enum CollectionSchema implements SchemaDeclaration {
// optional but recommended, part of index distribution
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, false, false, false, "date until resource shall be considered as fresh"),
referrer_id_txt(SolrType.string, true, true, true, false, false, "ids of referrer to this document"),// byte[] referrerHash();
referrer_id_s(SolrType.string, true, true, false, false, false, "id of the referrer to this document, discovered during crawling"),// byte[] referrerHash();
publisher_t(SolrType.text_general, true, true, false, false, true, "the name of the publisher of the document"),// String dc_publisher();
language_s(SolrType.string, true, true, false, false, false, "the language used in the document"),// byte[] language();
audiolinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to audio resources"),// int laudio();
@ -184,6 +186,11 @@ public enum CollectionSchema implements SchemaDeclaration {
opengraph_url_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"),
opengraph_image_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"),
// link structure for ranking
cr_host_count_i(SolrType.num_integer, true, true, false, false, false, "the number of documents within a single host"),
cr_host_chance_d(SolrType.num_double, true, true, false, false, false, "the chance to click on this page when randomly clicking on links within on one host"),
cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in italic_txt"),

Loading…
Cancel
Save