redesign of citation index storage

pull/1/head
Michael Peter Christen 12 years ago
parent 7c6ccc426c
commit 203921006a

@ -560,7 +560,7 @@ public class HostBrowser {
StringBuilder sbi = new StringBuilder();
int c = 0;
for (String s: references_internal_urls) {
sbi.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
sbi.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
c++;
if (c % 80 == 0) sbi.append("<br/>");
}
@ -568,7 +568,7 @@ public class HostBrowser {
StringBuilder sbe = new StringBuilder();
c = 0;
for (String s: references_external_urls) {
sbe.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
sbe.append("<a href='").append("/HostBrowser.html?path=" + s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
c++;
if (c % 80 == 0) sbe.append("<br/>");
}

@ -54,8 +54,10 @@ import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.peers.SeedDB;
import net.yacy.repository.Blacklist.BlacklistType;
@ -133,6 +135,15 @@ public final class CrawlStacker {
// this is the method that is called by the busy thread from outside
if (entry == null) return null;
// record the link graph for this request
byte[] anchorhash = entry.url().hash();
IndexCell<CitationReference> urlCitationIndex = this.indexSegment.urlCitation();
if (urlCitationIndex != null) try {
urlCitationIndex.add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
} catch (final Exception e) {
Log.logException(e);
}
try {
final String rejectReason = stackCrawl(entry);

@ -33,7 +33,6 @@ import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
@ -498,27 +497,6 @@ public class Segment {
return this.segmentPath;
}
/**
 * Writes one citation entry per outgoing anchor of a document into the citation index.
 * Each entry is stored under the hash of the anchor (the link target) and holds a
 * {@link CitationReference} built from the hash of the linking document and its
 * modification time.
 *
 * @param url         the document that contains the anchors
 * @param urlModified modification date of that document; its time value is stored with every reference
 * @param anchors     the outgoing links of the document, may be {@code null}
 * @return the number of anchors visited (0 if {@code anchors} is {@code null}); an anchor is
 *         counted even when no citation index is present or when storing its reference failed
 */
private int addCitationIndex(final DigestURI url, final Date urlModified, final Map<DigestURI, Properties> anchors) {
    if (anchors == null) {
        return 0;
    }

    // source side of every reference: the linking document itself
    final byte[] sourceHash = url.hash();
    final long sourceTime = urlModified.getTime();

    int visited = 0;
    for (final DigestURI target : anchors.keySet()) {
        final byte[] targetHash = target.hash();
        visited++;
        if (this.urlCitationIndex == null) {
            // no citation index present: the anchor is still counted
            continue;
        }
        try {
            this.urlCitationIndex.add(targetHash, new CitationReference(sourceHash, sourceTime));
        } catch (final Exception e) {
            // a failed write is logged and does not stop the remaining anchors
            Log.logException(e);
        }
    }
    return visited;
}
public synchronized void close() {
this.indexingPutDocumentProcessor.shutdown();
if (this.termIndex != null) this.termIndex.close();
@ -795,9 +773,6 @@ public class Segment {
}
}
// STORE PAGE REFERENCES INTO CITATION INDEX
final int refs = addCitationIndex(url, modDate, document.getAnchors());
// finish index time
final long indexingEndTime = System.currentTimeMillis();
@ -807,7 +782,7 @@ public class Segment {
"\n\tDescription: " + dc_title +
"\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " +
"Anchors: " + refs +
//"Anchors: " + refs +
"\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " +
"indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms");
}

@ -488,6 +488,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Set<DigestURI> inboundLinks = document.inboundLinks();
Set<DigestURI> outboundLinks = document.outboundLinks();
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
Map<DigestURI, Properties> alllinks = document.getAnchors();
int c = 0;
final Object parser = document.getParserObject();
Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
@ -677,7 +679,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
frames[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.framesscount_i, frames.length);
if (frames.length > 0) add(doc, CollectionSchema.frames_sxt, frames);
if (frames.length > 0) {
add(doc, CollectionSchema.frames_sxt, frames);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
// IFrames
@ -691,7 +696,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
iframes[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.iframesscount_i, iframes.length);
if (iframes.length > 0) add(doc, CollectionSchema.iframes_sxt, iframes);
if (iframes.length > 0) {
add(doc, CollectionSchema.iframes_sxt, iframes);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
// canonical tag
@ -791,10 +799,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size());
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
Map<DigestURI, Properties> alllinks = document.getAnchors();
// create a subgraph
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
//if () {
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations);
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations);

Loading…
Cancel
Save