From 203921006a364dd8a4bca9b5492cad6478411fa7 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 30 Jun 2013 02:11:46 +0200 Subject: [PATCH] redesign of citation index storage --- htroot/HostBrowser.java | 4 +-- source/net/yacy/crawler/CrawlStacker.java | 11 ++++++++ source/net/yacy/search/index/Segment.java | 27 +------------------ .../schema/CollectionConfiguration.java | 14 +++++++--- 4 files changed, 24 insertions(+), 32 deletions(-) diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 3e26b6ca7..bd3e5a3e1 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -560,7 +560,7 @@ public class HostBrowser { StringBuilder sbi = new StringBuilder(); int c = 0; for (String s: references_internal_urls) { - sbi.append("info"); + sbi.append("info"); c++; if (c % 80 == 0) sbi.append("
"); } @@ -568,7 +568,7 @@ public class HostBrowser { StringBuilder sbe = new StringBuilder(); c = 0; for (String s: references_external_urls) { - sbe.append("info"); + sbe.append("info"); c++; if (c % 80 == 0) sbe.append("
"); } diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 641de3af8..d97ec6706 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -54,8 +54,10 @@ import net.yacy.crawler.retrieval.HTTPLoader; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.crawler.robots.RobotsTxt; +import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.peers.SeedDB; import net.yacy.repository.Blacklist.BlacklistType; @@ -133,6 +135,15 @@ public final class CrawlStacker { // this is the method that is called by the busy thread from outside if (entry == null) return null; + // record the link graph for this request + byte[] anchorhash = entry.url().hash(); + IndexCell urlCitationIndex = this.indexSegment.urlCitation(); + if (urlCitationIndex != null) try { + urlCitationIndex.add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime())); + } catch (final Exception e) { + Log.logException(e); + } + try { final String rejectReason = stackCrawl(entry); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index f6e122e3e..5c5edd8c8 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -33,7 +33,6 @@ import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.Map; -import java.util.Properties; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; @@ -498,27 +497,6 @@ public class Segment { return this.segmentPath; } - private int addCitationIndex(final DigestURI url, final Date urlModified, final Map anchors) { - if (anchors == null) return 0; - int refCount = 0; - - // iterate over all outgoing links, this will create a context for those links - final byte[] urlhash = url.hash(); - final long urldate = urlModified.getTime(); - for (Map.Entry anchorEntry: anchors.entrySet()) { - DigestURI anchor = anchorEntry.getKey(); - byte[] refhash = anchor.hash(); - //System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString()); - if (this.urlCitationIndex != null) try { - this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate)); - } catch (final Exception e) { - Log.logException(e); - } - refCount++; - } - return refCount; - } - public synchronized void close() { this.indexingPutDocumentProcessor.shutdown(); if (this.termIndex != null) this.termIndex.close(); @@ -795,9 +773,6 @@ public class Segment { } } - // STORE PAGE REFERENCES INTO CITATION INDEX - final int refs = addCitationIndex(url, modDate, document.getAnchors()); - // finish index time final long indexingEndTime = System.currentTimeMillis(); @@ -807,7 +782,7 @@ public class Segment { "\n\tDescription: " + dc_title + "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " + "Size: " + document.getTextLength() + " bytes | " + - "Anchors: " + refs + + //"Anchors: " + refs + "\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " + "indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms"); } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 4e04b8288..238cc4bca 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -488,6 +488,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Set inboundLinks = document.inboundLinks(); Set outboundLinks = document.outboundLinks(); + Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); + Map alllinks = document.getAnchors(); int c = 0; final Object parser = document.getParserObject(); Map images = new HashMap(); @@ -677,7 +679,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri frames[c++] = u.toNormalform(false); } add(doc, CollectionSchema.framesscount_i, frames.length); - if (frames.length > 0) add(doc, CollectionSchema.frames_sxt, frames); + if (frames.length > 0) { + add(doc, CollectionSchema.frames_sxt, frames); + //webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound + } } // IFrames @@ -691,7 +696,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri iframes[c++] = u.toNormalform(false); } add(doc, CollectionSchema.iframesscount_i, iframes.length); - if (iframes.length > 0) add(doc, CollectionSchema.iframes_sxt, iframes); + if (iframes.length > 0) { + add(doc, CollectionSchema.iframes_sxt, iframes); + //webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound + } } // canonical tag @@ -791,10 +799,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount()); if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size()); if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); - Map alllinks = document.getAnchors(); // create a subgraph - Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); //if () { webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations); webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations);