redesign of citation index storage

pull/1/head
Michael Peter Christen 12 years ago
parent 7c6ccc426c
commit 203921006a

@ -560,7 +560,7 @@ public class HostBrowser {
StringBuilder sbi = new StringBuilder(); StringBuilder sbi = new StringBuilder();
int c = 0; int c = 0;
for (String s: references_internal_urls) { for (String s: references_internal_urls) {
sbi.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>"); sbi.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
c++; c++;
if (c % 80 == 0) sbi.append("<br/>"); if (c % 80 == 0) sbi.append("<br/>");
} }
@ -568,7 +568,7 @@ public class HostBrowser {
StringBuilder sbe = new StringBuilder(); StringBuilder sbe = new StringBuilder();
c = 0; c = 0;
for (String s: references_external_urls) { for (String s: references_external_urls) {
sbe.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>"); sbe.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
c++; c++;
if (c % 80 == 0) sbe.append("<br/>"); if (c % 80 == 0) sbe.append("<br/>");
} }

@ -54,8 +54,10 @@ import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.peers.SeedDB; import net.yacy.peers.SeedDB;
import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.Blacklist.BlacklistType;
@ -133,6 +135,15 @@ public final class CrawlStacker {
// this is the method that is called by the busy thread from outside // this is the method that is called by the busy thread from outside
if (entry == null) return null; if (entry == null) return null;
// record the link graph for this request
byte[] anchorhash = entry.url().hash();
IndexCell<CitationReference> urlCitationIndex = this.indexSegment.urlCitation();
if (urlCitationIndex != null) try {
urlCitationIndex.add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
} catch (final Exception e) {
Log.logException(e);
}
try { try {
final String rejectReason = stackCrawl(entry); final String rejectReason = stackCrawl(entry);

@ -33,7 +33,6 @@ import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Properties;
import java.util.Set; import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
@ -498,27 +497,6 @@ public class Segment {
return this.segmentPath; return this.segmentPath;
} }
/**
 * Writes one citation reference per outgoing link of a document into the
 * citation index and reports how many links were visited.
 *
 * For every key of {@code anchors} a {@link CitationReference} built from the
 * hash of {@code url} and the time stamp of {@code urlModified} is added under
 * the hash of the linked URL. Links are counted whether or not a citation index
 * is present; an exception raised while adding a reference is logged and does
 * not stop the iteration.
 *
 * @param url the document that contains the links
 * @param urlModified modification date of that document
 * @param anchors the outgoing links of the document; may be {@code null}
 * @return the number of anchors iterated, or 0 if {@code anchors} is {@code null}
 */
private int addCitationIndex(final DigestURI url, final Date urlModified, final Map<DigestURI, Properties> anchors) {
    if (anchors == null) {
        return 0;
    }

    // the citing side is identical for all links, so resolve it once up front
    final byte[] citingHash = url.hash();
    final long citingTime = urlModified.getTime();

    int visited = 0;
    for (final DigestURI target : anchors.keySet()) {
        // the hash is computed before the index check, exactly as before
        final byte[] citedHash = target.hash();
        visited++;
        if (this.urlCitationIndex == null) {
            continue;
        }
        try {
            this.urlCitationIndex.add(citedHash, new CitationReference(citingHash, citingTime));
        } catch (final Exception e) {
            Log.logException(e);
        }
    }
    return visited;
}
public synchronized void close() { public synchronized void close() {
this.indexingPutDocumentProcessor.shutdown(); this.indexingPutDocumentProcessor.shutdown();
if (this.termIndex != null) this.termIndex.close(); if (this.termIndex != null) this.termIndex.close();
@ -795,9 +773,6 @@ public class Segment {
} }
} }
// STORE PAGE REFERENCES INTO CITATION INDEX
final int refs = addCitationIndex(url, modDate, document.getAnchors());
// finish index time // finish index time
final long indexingEndTime = System.currentTimeMillis(); final long indexingEndTime = System.currentTimeMillis();
@ -807,7 +782,7 @@ public class Segment {
"\n\tDescription: " + dc_title + "\n\tDescription: " + dc_title +
"\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " + "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " + "Size: " + document.getTextLength() + " bytes | " +
"Anchors: " + refs + //"Anchors: " + refs +
"\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " + "\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " +
"indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms"); "indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms");
} }

@ -488,6 +488,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Set<DigestURI> inboundLinks = document.inboundLinks(); Set<DigestURI> inboundLinks = document.inboundLinks();
Set<DigestURI> outboundLinks = document.outboundLinks(); Set<DigestURI> outboundLinks = document.outboundLinks();
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
Map<DigestURI, Properties> alllinks = document.getAnchors();
int c = 0; int c = 0;
final Object parser = document.getParserObject(); final Object parser = document.getParserObject();
Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>(); Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
@ -677,7 +679,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
frames[c++] = u.toNormalform(false); frames[c++] = u.toNormalform(false);
} }
add(doc, CollectionSchema.framesscount_i, frames.length); add(doc, CollectionSchema.framesscount_i, frames.length);
if (frames.length > 0) add(doc, CollectionSchema.frames_sxt, frames); if (frames.length > 0) {
add(doc, CollectionSchema.frames_sxt, frames);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
}
} }
// IFrames // IFrames
@ -691,7 +696,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
iframes[c++] = u.toNormalform(false); iframes[c++] = u.toNormalform(false);
} }
add(doc, CollectionSchema.iframesscount_i, iframes.length); add(doc, CollectionSchema.iframesscount_i, iframes.length);
if (iframes.length > 0) add(doc, CollectionSchema.iframes_sxt, iframes); if (iframes.length > 0) {
add(doc, CollectionSchema.iframes_sxt, iframes);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
}
} }
// canonical tag // canonical tag
@ -791,10 +799,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount()); if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size()); if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size());
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
Map<DigestURI, Properties> alllinks = document.getAnchors();
// create a subgraph // create a subgraph
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
//if () { //if () {
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations); webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations);
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations); webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations);

Loading…
Cancel
Save