diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index bd5d614c5..cf49a3575 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -54,7 +54,6 @@ import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.robots.RobotsTxt; import net.yacy.document.TextParser; -import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.peers.SeedDB; import net.yacy.repository.Blacklist.BlacklistType; @@ -131,18 +130,6 @@ public final class CrawlStacker { public Request job(final Request entry) { // this is the method that is called by the busy thread from outside if (entry == null) return null; - - // record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument - byte[] anchorhash = entry.url().hash(); - if (entry.referrerhash() != null) { - if (this.indexSegment.connectedCitation()) try { - this.indexSegment.urlCitation().add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime())); - } catch (final Exception e) { - ConcurrentLog.logException(e); - } - - // TODO: write to webgraph?? - } try { final String rejectReason = stackCrawl(entry); diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 8cbe3ebd5..c7fd11199 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -52,6 +52,7 @@ import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.MapTools; import net.yacy.kelondro.util.kelondroException; import net.yacy.search.query.QueryParams; +import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; import net.yacy.utils.crypt; @@ -417,18 +418,10 @@ public class URIMetadataNode extends SolrDocument { public WordReferenceVars word() { return this.word; } - - private static List indexedList2protocolList(Collection iplist, int dimension) { - List a = new ArrayList(dimension); - for (int i = 0; i < dimension; i++) a.add("http"); - if (iplist == null) return a; - for (Object ip: iplist) a.set(Integer.parseInt(((String) ip).substring(0, 3)), ((String) ip).substring(4)); - return a; - } public static Iterator getLinks(SolrDocument doc, boolean inbound) { Collection urlstub = doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_urlstub_sxt : CollectionSchema.outboundlinks_urlstub_sxt).getSolrFieldName()); - Collection urlprot = urlstub == null ? null : indexedList2protocolList(doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_protocol_sxt : CollectionSchema.outboundlinks_protocol_sxt).getSolrFieldName()), urlstub.size()); + Collection urlprot = urlstub == null ? null : CollectionConfiguration.indexedList2protocolList(doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_protocol_sxt : CollectionSchema.outboundlinks_protocol_sxt).getSolrFieldName()), urlstub.size()); String u; LinkedHashSet list = new LinkedHashSet(); if (urlprot != null && urlstub != null) { diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java index 671ce1bac..a20dc228b 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java @@ -170,7 +170,7 @@ public final class ReferenceContainerCache exte } try { dump.close(true); - log.info("finished rwi heap dump: " + wordcount + " words, " + urlcount + " word/URL relations in " + (System.currentTimeMillis() - startTime) + " milliseconds"); + log.info("finished rwi heap dump: " + wordcount + " terms, " + urlcount + " term/data relations in " + (System.currentTimeMillis() - startTime) + " milliseconds"); } catch (final IOException e) { log.severe("failed rwi heap dump: " + e.getMessage(), e); } finally { diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index c1859430d..7fea19153 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -29,6 +29,7 @@ package net.yacy.search.index; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; +import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -44,7 +45,6 @@ import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; @@ -532,7 +532,8 @@ public class Segment { char docType = Response.docType(document.dc_format()); // CREATE SOLR DOCUMENT - final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName); + final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration(); + final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName); // ENRICH DOCUMENT WITH RANKING INFORMATION this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null); @@ -560,23 +561,43 @@ public class Segment { } } - // write the edges to the citation reference index - if (this.connectedCitation()) try { - // normal links - for (SolrInputDocument edge: webgraph) { - String referrerhash = (String) edge.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); - String anchorhash = (String) edge.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()); - if (referrerhash != null && anchorhash != null) { - urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime())); + } + + + // write the edges to the citation reference index + if (this.connectedCitation()) try { + // we use the subgraph to write the citation index, that shall cause that the webgraph and the citation index is identical + + if (collectionConfig.contains(CollectionSchema.inboundlinks_protocol_sxt) || collectionConfig.contains(CollectionSchema.inboundlinks_urlstub_sxt)) { + Collection inboundlinks_urlstub = vector.getFieldValues(CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName()); + List inboundlinks_protocol = inboundlinks_urlstub == null ? null : CollectionConfiguration.indexedList2protocolList(vector.getFieldValues(CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName()), inboundlinks_urlstub.size()); + if (inboundlinks_protocol != null && inboundlinks_urlstub != null && inboundlinks_protocol.size() == inboundlinks_urlstub.size() && inboundlinks_urlstub instanceof List) { + for (int i = 0; i < inboundlinks_protocol.size(); i++) { + String targetURL = inboundlinks_protocol.get(i) + "://" + ((String) ((List) inboundlinks_urlstub).get(i)); + String referrerhash = id; + String anchorhash = ASCII.String(new DigestURL(targetURL).hash()); + if (referrerhash != null && anchorhash != null) { + urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime())); + } } } - // media links as well! - for (DigestURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime())); - for (AnchorURL audio: document.getAudiolinks().keySet()) urlCitationIndex.add(audio.hash(), new CitationReference(url.hash(), loadDate.getTime())); - for (AnchorURL video: document.getVideolinks().keySet()) urlCitationIndex.add(video.hash(), new CitationReference(url.hash(), loadDate.getTime())); - } catch (Throwable e) { - ConcurrentLog.logException(e); } + if (collectionConfig.contains(CollectionSchema.outboundlinks_protocol_sxt) || collectionConfig.contains(CollectionSchema.outboundlinks_urlstub_sxt)) { + Collection outboundlinks_urlstub = vector.getFieldValues(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()); + List outboundlinks_protocol = outboundlinks_urlstub == null ? null : CollectionConfiguration.indexedList2protocolList(vector.getFieldValues(CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName()), outboundlinks_urlstub.size()); + if (outboundlinks_protocol != null && outboundlinks_urlstub != null && outboundlinks_protocol.size() == outboundlinks_urlstub.size() && outboundlinks_urlstub instanceof List) { + for (int i = 0; i < outboundlinks_protocol.size(); i++) { + String targetURL = outboundlinks_protocol.get(i) + "://" + ((String) ((List) outboundlinks_urlstub).get(i)); + String referrerhash = id; + String anchorhash = ASCII.String(new DigestURL(targetURL).hash()); + if (referrerhash != null && anchorhash != null) { + urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime())); + } + } + } + } + } catch (Throwable e) { + ConcurrentLog.logException(e); } if (error != null) { diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index e9345c115..c0d7805a0 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -1750,7 +1750,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param protocol * @return a list of indexed protocol entries */ - private static List protocolList2indexedList(final List protocol) { + public static List protocolList2indexedList(final List protocol) { List a = new ArrayList(); String p; for (int i = 0; i < protocol.size(); i++) { @@ -1764,6 +1764,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri return a; } + public static List indexedList2protocolList(Collection iplist, int dimension) { + List a = new ArrayList(dimension); + for (int i = 0; i < dimension; i++) a.add("http"); + if (iplist == null) return a; + for (Object ip: iplist) a.set(Integer.parseInt(((String) ip).substring(0, 3)), ((String) ip).substring(4)); + return a; + } + /** * encode a string containing attributes from anchor rel properties binary: * bit 0: "me" contained in rel