changed the way the citation index is written: do not pick up references

during document parsing; instead use the same references that would also
be written into the webgraph. This should ensure that the webgraph and
the citation index express exactly the same semantics.
pull/1/head
Michael Peter Christen 11 years ago
parent 57ce7eeff3
commit a7dd89c4de

@ -54,7 +54,6 @@ import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.peers.SeedDB;
import net.yacy.repository.Blacklist.BlacklistType;
@ -131,18 +130,6 @@ public final class CrawlStacker {
public Request job(final Request entry) {
// this is the method that is called by the busy thread from outside
if (entry == null) return null;
// record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument
byte[] anchorhash = entry.url().hash();
if (entry.referrerhash() != null) {
if (this.indexSegment.connectedCitation()) try {
this.indexSegment.urlCitation().add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
} catch (final Exception e) {
ConcurrentLog.logException(e);
}
// TODO: write to webgraph??
}
try {
final String rejectReason = stackCrawl(entry);

@ -52,6 +52,7 @@ import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MapTools;
import net.yacy.kelondro.util.kelondroException;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.utils.crypt;
@ -417,18 +418,10 @@ public class URIMetadataNode extends SolrDocument {
/**
 * Accessor for the word reference attached to this node.
 * @return the value of this node's word field; no copy is made
 */
public WordReferenceVars word() {
return this.word;
}
/**
 * Turn a sparse list of position-tagged protocol entries into a dense protocol list.
 * The result has one slot per position, each preset to "http". Every entry of
 * iplist overrides one slot: its first three characters are parsed as the decimal
 * slot position and everything from character index 4 onwards is the protocol name.
 * @param iplist position-tagged protocol entries (String objects); null leaves all slots at "http"
 * @param dimension number of slots in the resulting list
 * @return a list of dimension protocol names
 */
private static List<String> indexedList2protocolList(Collection<Object> iplist, int dimension) {
    final List<String> protocols = new ArrayList<String>(dimension);
    // start with the default protocol in every slot
    int filled = 0;
    while (filled < dimension) {
        protocols.add("http");
        filled++;
    }
    if (iplist != null) {
        for (final Object item : iplist) {
            final String indexed = (String) item;
            final int position = Integer.parseInt(indexed.substring(0, 3));
            protocols.set(position, indexed.substring(4));
        }
    }
    return protocols;
}
public static Iterator<String> getLinks(SolrDocument doc, boolean inbound) {
Collection<Object> urlstub = doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_urlstub_sxt : CollectionSchema.outboundlinks_urlstub_sxt).getSolrFieldName());
Collection<String> urlprot = urlstub == null ? null : indexedList2protocolList(doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_protocol_sxt : CollectionSchema.outboundlinks_protocol_sxt).getSolrFieldName()), urlstub.size());
Collection<String> urlprot = urlstub == null ? null : CollectionConfiguration.indexedList2protocolList(doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_protocol_sxt : CollectionSchema.outboundlinks_protocol_sxt).getSolrFieldName()), urlstub.size());
String u;
LinkedHashSet<String> list = new LinkedHashSet<String>();
if (urlprot != null && urlstub != null) {

@ -170,7 +170,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
}
try {
dump.close(true);
log.info("finished rwi heap dump: " + wordcount + " words, " + urlcount + " word/URL relations in " + (System.currentTimeMillis() - startTime) + " milliseconds");
log.info("finished rwi heap dump: " + wordcount + " terms, " + urlcount + " term/data relations in " + (System.currentTimeMillis() - startTime) + " milliseconds");
} catch (final IOException e) {
log.severe("failed rwi heap dump: " + e.getMessage(), e);
} finally {

@ -29,6 +29,7 @@ package net.yacy.search.index;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
@ -44,7 +45,6 @@ import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
@ -532,7 +532,8 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration();
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
@ -560,23 +561,43 @@ public class Segment {
}
}
// write the edges to the citation reference index
if (this.connectedCitation()) try {
// normal links
for (SolrInputDocument edge: webgraph) {
String referrerhash = (String) edge.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
String anchorhash = (String) edge.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
if (referrerhash != null && anchorhash != null) {
urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
}
// write the edges to the citation reference index
if (this.connectedCitation()) try {
// we use the subgraph to write the citation index, that shall cause that the webgraph and the citation index is identical
if (collectionConfig.contains(CollectionSchema.inboundlinks_protocol_sxt) || collectionConfig.contains(CollectionSchema.inboundlinks_urlstub_sxt)) {
Collection<Object> inboundlinks_urlstub = vector.getFieldValues(CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName());
List<String> inboundlinks_protocol = inboundlinks_urlstub == null ? null : CollectionConfiguration.indexedList2protocolList(vector.getFieldValues(CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName()), inboundlinks_urlstub.size());
if (inboundlinks_protocol != null && inboundlinks_urlstub != null && inboundlinks_protocol.size() == inboundlinks_urlstub.size() && inboundlinks_urlstub instanceof List<?>) {
for (int i = 0; i < inboundlinks_protocol.size(); i++) {
String targetURL = inboundlinks_protocol.get(i) + "://" + ((String) ((List<?>) inboundlinks_urlstub).get(i));
String referrerhash = id;
String anchorhash = ASCII.String(new DigestURL(targetURL).hash());
if (referrerhash != null && anchorhash != null) {
urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
}
}
}
// media links as well!
for (DigestURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (AnchorURL audio: document.getAudiolinks().keySet()) urlCitationIndex.add(audio.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (AnchorURL video: document.getVideolinks().keySet()) urlCitationIndex.add(video.hash(), new CitationReference(url.hash(), loadDate.getTime()));
} catch (Throwable e) {
ConcurrentLog.logException(e);
}
if (collectionConfig.contains(CollectionSchema.outboundlinks_protocol_sxt) || collectionConfig.contains(CollectionSchema.outboundlinks_urlstub_sxt)) {
Collection<Object> outboundlinks_urlstub = vector.getFieldValues(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName());
List<String> outboundlinks_protocol = outboundlinks_urlstub == null ? null : CollectionConfiguration.indexedList2protocolList(vector.getFieldValues(CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName()), outboundlinks_urlstub.size());
if (outboundlinks_protocol != null && outboundlinks_urlstub != null && outboundlinks_protocol.size() == outboundlinks_urlstub.size() && outboundlinks_urlstub instanceof List<?>) {
for (int i = 0; i < outboundlinks_protocol.size(); i++) {
String targetURL = outboundlinks_protocol.get(i) + "://" + ((String) ((List<?>) outboundlinks_urlstub).get(i));
String referrerhash = id;
String anchorhash = ASCII.String(new DigestURL(targetURL).hash());
if (referrerhash != null && anchorhash != null) {
urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
}
}
}
}
} catch (Throwable e) {
ConcurrentLog.logException(e);
}
if (error != null) {

@ -1750,7 +1750,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param protocol
* @return a list of indexed protocol entries
*/
private static List<String> protocolList2indexedList(final List<String> protocol) {
public static List<String> protocolList2indexedList(final List<String> protocol) {
List<String> a = new ArrayList<String>();
String p;
for (int i = 0; i < protocol.size(); i++) {
@ -1764,6 +1764,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return a;
}
/**
 * Expand a sparse list of position-tagged protocol entries into a dense protocol list.
 * The result holds dimension entries, each preset to "http". Every usable entry of
 * iplist overrides one of them: its first three characters are parsed as the decimal
 * target position and everything from character index 4 onwards is the protocol name.
 * Entries that are not strings, are shorter than four characters, carry a non-numeric
 * position or point outside of [0, dimension) are skipped, so that a single bad
 * entry does not discard the protocols of all other links.
 * @param iplist position-tagged protocol entries; null leaves all positions at "http"
 * @param dimension the number of entries in the resulting list
 * @return a list of dimension protocol names
 */
public static List<String> indexedList2protocolList(Collection<Object> iplist, int dimension) {
    List<String> a = new ArrayList<String>(dimension);
    for (int i = 0; i < dimension; i++) a.add("http");
    if (iplist == null) return a;
    for (Object ip: iplist) {
        if (!(ip instanceof String)) continue; // also skips null entries
        final String entry = (String) ip;
        if (entry.length() < 4) continue; // too short to hold position and separator
        int pos;
        try {
            pos = Integer.parseInt(entry.substring(0, 3));
        } catch (final NumberFormatException e) {
            continue; // position prefix is not a number
        }
        if (pos < 0 || pos >= dimension) continue; // would not fit into the result list
        a.set(pos, entry.substring(4));
    }
    return a;
}
/**
* encode a string containing attributes from anchor rel properties binary:
* bit 0: "me" contained in rel

Loading…
Cancel
Save