- enhanced generation of url objects

- enhanced computation of link structure graphics
- enhanced collection of data for link structures
pull/1/head
Michael Peter Christen 12 years ago
parent 4023d88b0b
commit 21fe8339b4

@ -181,7 +181,7 @@ public class CrawlStartScanner_p
final Scanner.Service u = se.next().getKey();
DigestURI uu;
try {
uu = new DigestURI(u.url());
uu = DigestURI.toDigestURI(u.url());
pkmap.put(uu.hash(), uu);
} catch ( final MalformedURLException e ) {
Log.logException(e);
@ -236,7 +236,7 @@ public class CrawlStartScanner_p
while ( se.hasNext() ) {
host = se.next();
try {
u = new DigestURI(host.getKey().url());
u = DigestURI.toDigestURI(host.getKey().url());
urlString = u.toNormalform(true);
if ( host.getValue() == Access.granted
&& Scanner.inIndex(apiCommentCache, urlString) == null ) {

@ -296,7 +296,7 @@ public class Crawler_p {
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
// get links and generate filter
for (MultiProtocolURI u: scraper.getAnchors().keySet()) {
newRootURLs.add(new DigestURI(u));
newRootURLs.add(DigestURI.toDigestURI(u));
}
} catch (IOException e) {
Log.logException(e);

@ -63,7 +63,7 @@ public class ServerScannerList {
while (se.hasNext()) {
host = se.next();
try {
u = new DigestURI(host.getKey().url());
u = DigestURI.toDigestURI(host.getKey().url());
urlString = u.toNormalform(true);
prop.put("servertable_list_" + i + "_edit", edit ? 1 : 0);
prop.put("servertable_list_" + i + "_edit_pk", ASCII.String(u.hash()));

@ -68,6 +68,17 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<h2>Web Structure</h2>
#(hosts)#::
<fieldset><legend>Host List</legend>
<!-- one entry per listed host: a link that opens the structure view for that host, followed by its outlink count -->
#{list}#
<div style="float:left; padding:1px 5px 1px 5px;">
<div style="width:160px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><a href="/WatchWebStructure_p.html?host=#[host]#&amp;depth=3&amp;time=1000">#[host]#</a></div>
<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]# outlinks</div>
</div>
#{/list}#
</fieldset>
#(/hosts)#
<div id="left">
<form action="/WatchWebStructure_p.html" accept-charset="UTF-8" onsubmit="return checkform(this);">
<fieldset>

@ -4,7 +4,10 @@
//$LastChangedBy$
//
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.kelondro.data.meta.DigestURI;
@ -27,7 +30,7 @@ public class WatchWebStructure_p {
int width = 1024;
int height = 576;
int depth = 3;
int nodes = 500; // maximum number of host nodes that are painted
int nodes = 300; // maximum number of host nodes that are painted
int time = -1;
String host = "auto";
String besthost;
@ -36,7 +39,7 @@ public class WatchWebStructure_p {
width = post.getInt("width", 1024);
height = post.getInt("height", 576);
depth = post.getInt("depth", 3);
nodes = post.getInt("nodes", width * height * 100 / 1024 / 576);
nodes = post.getInt("nodes", width * height * 300 / 1024 / 576);
time = post.getInt("time", -1);
host = post.get("host", "auto");
color_text = post.get("colortext", color_text);
@ -70,6 +73,22 @@ public class WatchWebStructure_p {
host = "www." + host;
}
}
if (post != null && post.containsKey("hosts")) {
int maxcount = 200;
ReversibleScoreMap<String> score = sb.webStructure.hostReferenceScore();
int c = 0;
Iterator<String> i = score.keys(false);
String h;
while (i.hasNext() && c < maxcount) {
h = i.next();
prop.put("hosts_list_" + c + "_host", h);
prop.put("hosts_list_" + c + "_count", score.get(h));
c++;
}
prop.put("hosts_list", c);
prop.put("hosts", 1);
}
// find start point
if (host == null ||

@ -28,7 +28,6 @@ import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -61,7 +60,7 @@ public class WebStructurePicture_p {
int width = 1024;
int height = 576;
int depth = 3;
int nodes = 100; // maximum number of host nodes that are painted
int nodes = 300; // maximum number of host nodes that are painted
int time = -1;
String host = null;
int cyc = 0;
@ -156,16 +155,13 @@ public class WebStructurePicture_p {
final double radius = 1.0 / (1 << nextlayer);
final WebStructureGraph.StructureEntry sr = structure.outgoingReferences(centerhash);
final Map<String, Integer> next = (sr == null) ? new HashMap<String, Integer>() : sr.references;
Map.Entry<String, Integer> entry;
String targethash, targethost;
// first set points to next hosts
final Iterator<Map.Entry<String, Integer>> i = next.entrySet().iterator();
final List<String[]> targets = new ArrayList<String[]>();
int maxtargetrefs = 8, maxthisrefs = 8;
int targetrefs, thisrefs;
double rr, re;
while (i.hasNext() && maxnodes > 0 && System.currentTimeMillis() < timeout) {
entry = i.next();
for (Map.Entry<String, Integer> entry: next.entrySet()) {
targethash = entry.getKey();
targethost = structure.hostHash2hostName(targethash);
if (targethost == null) continue;
@ -181,15 +177,12 @@ public class WebStructurePicture_p {
rr = radius * 0.25 * (1 - targetrefs / (double) maxtargetrefs);
re = radius * 0.5 * (thisrefs / (double) maxthisrefs);
graph.addNode(targethost, x + (radius - rr - re) * Math.cos(angle), y + (radius - rr - re) * Math.sin(angle), nextlayer);
maxnodes--;
mynodes++;
if (maxnodes-- <= 0 || System.currentTimeMillis() >= timeout) break;
}
// recursively set next hosts
final Iterator<String[]> j = targets.iterator();
String[] target;
int nextnodes;
while (j.hasNext()) {
target = j.next();
for (String[] target: targets) {
targethash = target[0];
targethost = target[1];
final GraphPlotter.Point c = graph.getNode(targethost);

@ -79,14 +79,14 @@ public class webstructure {
prop.put("out", 1);
prop.put("in", 1);
WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hosthash);
if (sentry != null) {
if (sentry != null && sentry.references.size() > 0) {
reference(prop, "out", 0, sentry, sb.webStructure);
prop.put("out_domains", 1);
} else {
prop.put("out_domains", 0);
}
sentry = sb.webStructure.incomingReferences(hosthash);
if (sentry != null) {
if (sentry != null && sentry.references.size() > 0) {
reference(prop, "in", 0, sentry, sb.webStructure);
prop.put("in_domains", 1);
} else {
@ -113,7 +113,7 @@ public class webstructure {
int d = 0;
Iterator<MultiProtocolURI> i = scraper.inboundLinks().iterator();
while (i.hasNext()) {
DigestURI refurl = new DigestURI(i.next());
DigestURI refurl = DigestURI.toDigestURI(i.next());
byte[] refhash = refurl.hash();
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
@ -122,7 +122,7 @@ public class webstructure {
}
i = scraper.outboundLinks().iterator();
while (i.hasNext()) {
DigestURI refurl = new DigestURI(i.next());
DigestURI refurl = DigestURI.toDigestURI(i.next());
byte[] refhash = refurl.hash();
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);

@ -204,7 +204,7 @@ public final class CrawlStacker {
if (e.getKey() == null) continue;
// delete the old entry, if it exists, to force a re-load of the url (that's wanted here)
final DigestURI url = new DigestURI(e.getKey());
final DigestURI url = DigestURI.toDigestURI(e.getKey());
final byte[] urlhash = url.hash();
if (replace) {
this.indexSegment.fulltext().remove(urlhash);

@ -150,7 +150,7 @@ public final class HTTPLoader {
}
// normalize URL
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
@ -289,7 +289,7 @@ public final class HTTPLoader {
}
// normalizing URL
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// if we are already doing a shutdown we don't need to retry crawling

@ -155,7 +155,7 @@ public class BookmarkHelper {
if ("".equals(title)) {//cannot be displayed
title = url.toString();
}
bm = db.new Bookmark(new DigestURI(url));
bm = db.new Bookmark(DigestURI.toDigestURI(url));
bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
bm.setTags(tags);
bm.setPublic(importPublic);

@ -168,7 +168,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
Document[] theDocs;
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final DigestURI url = new DigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath));
final DigestURI url = DigestURI.toDigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath));
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());

@ -90,7 +90,7 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(new DigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp);
subDocs = TextParser.parseSource(DigestURI.toDigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp);
if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -87,7 +87,7 @@ public class zipParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURI virtualURL = new DigestURI(MultiProtocolURI.newURL(url, "#" + name));
final DigestURI virtualURL = DigestURI.toDigestURI(MultiProtocolURI.newURL(url, "#" + name));
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp);
if (docs == null) continue;

@ -133,13 +133,18 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
/**
* DigestURI from general URI
* @param baseURL
* @param u
*/
public DigestURI(final MultiProtocolURI baseURL) {
super(baseURL);
this.hash = (baseURL instanceof DigestURI) ? ((DigestURI) baseURL).hash : null;
private DigestURI(final MultiProtocolURI u) {
    super(u);
    // take over the hash field when the source already is a DigestURI;
    // for any other MultiProtocolURI the hash field starts out as null
    if (u instanceof DigestURI) {
        this.hash = ((DigestURI) u).hash;
    } else {
        this.hash = null;
    }
}
/**
 * Returns the given URI as a DigestURI.
 * An argument that already is a DigestURI is handed back unchanged (same instance);
 * any other MultiProtocolURI is wrapped into a newly constructed DigestURI.
 *
 * @param u the URI to convert
 * @return {@code u} itself if it is a DigestURI, otherwise a new DigestURI built from it
 */
public static DigestURI toDigestURI(MultiProtocolURI u) {
    if (u instanceof DigestURI) {
        return (DigestURI) u;
    }
    return new DigestURI(u);
}
/**
* DigestURI from general URI, hash already calculated
* @param baseURL

@ -49,6 +49,8 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.LookAheadIterator;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.Document;
@ -64,8 +66,7 @@ import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
public class WebStructureGraph
{
public class WebStructureGraph {
public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
public static int maxhosts = 50000; // maximum number of hosts in web structure map
@ -75,17 +76,16 @@ public class WebStructureGraph
private final File structureFile;
private final TreeMap<String, byte[]> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private final TreeMap<String, byte[]> structure_new;
private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue;
private final BlockingQueue<LearnObject> publicRefDNSResolvingQueue;
private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker;
private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null);
private final static LearnObject leanrefObjectPOISON = new LearnObject(null, null);
private static class leanrefObject
{
private static class LearnObject {
private final DigestURI url;
private final Set<MultiProtocolURI> globalRefURLs;
private leanrefObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
private LearnObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
this.url = url;
this.globalRefURLs = globalRefURLs;
}
@ -95,7 +95,7 @@ public class WebStructureGraph
this.structure_old = new TreeMap<String, byte[]>();
this.structure_new = new TreeMap<String, byte[]>();
this.structureFile = structureFile;
this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<leanrefObject>();
this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<LearnObject>();
// load web structure
Map<String, byte[]> loadedStructureB;
@ -142,7 +142,7 @@ public class WebStructureGraph
@Override
public void run() {
leanrefObject lro;
LearnObject lro;
try {
while ( (lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON ) {
learnrefs(lro);
@ -170,7 +170,7 @@ public class WebStructureGraph
globalRefURLs.add(u);
}
}
final leanrefObject lro = new leanrefObject(url, globalRefURLs);
final LearnObject lro = new LearnObject(url, globalRefURLs);
if ( !globalRefURLs.isEmpty() ) {
try {
if ( this.publicRefDNSResolvingWorker.isAlive() ) {
@ -184,34 +184,6 @@ public class WebStructureGraph
}
}
/**
 * Concatenates the url hashes of all global reference URLs of the given object
 * into one string and passes it, together with the object's url, to learn().
 * The loop stops early when the switchboard signals termination.
 *
 * @param lro the url and its set of global reference URLs
 */
private void learnrefs(final leanrefObject lro) {
    final StringBuilder hashes = new StringBuilder(240);
    assert hashes.length() % 12 == 0 : "cpg.length() = " + hashes.length() + ", cpg = " + hashes.toString();
    for ( final MultiProtocolURI ref : lro.globalRefURLs ) {
        if (Switchboard.getSwitchboard().shallTerminate()) break;
        final byte[] hashBytes = new DigestURI(ref).hash();
        assert hashBytes != null;
        if ( hashBytes == null ) {
            continue; // nothing to record for this reference
        }
        final String hash = ASCII.String(hashBytes);
        assert hash.length() == 12 : "nexturlhash.length() = "
            + hash.length()
            + ", nexturlhash = "
            + hash;
        // this is a global link: store the complete hash
        hashes.append(hash);
        assert hashes.length() % 12 == 0 : "cpg.length() = "
            + hashes.length()
            + ", cpg = "
            + hashes.toString();
    }
    assert hashes.length() % 12 == 0 : "cpg.length() = " + hashes.length() + ", cpg = " + hashes.toString();
    learn(lro.url, hashes);
}
private static int refstr2count(final String refs) {
if ( (refs == null) || (refs.length() <= 8) ) {
return 0;
@ -220,7 +192,7 @@ public class WebStructureGraph
return (refs.length() - 8) / 10;
}
static Map<String, Integer> refstr2map(final String refs) {
private static Map<String, Integer> refstr2map(final String refs) {
if ( (refs == null) || (refs.length() <= 8) ) {
return new HashMap<String, Integer>();
}
@ -240,8 +212,12 @@ public class WebStructureGraph
return map;
}
/**
 * Produces a reference string that carries no references: only the output of
 * the short-day formatter, which is the same prefix that map2refstr writes
 * in front of its reference entries.
 *
 * @return the formatted short-day string without any reference entries
 */
private static String none2refstr() {
    final String dayOnly = GenericFormatter.SHORT_DAY_FORMATTER.format();
    return dayOnly;
}
private static String map2refstr(final Map<String, Integer> map) {
final StringBuilder s = new StringBuilder(map.size() * 10);
final StringBuilder s = new StringBuilder(GenericFormatter.PATTERN_SHORT_DAY.length() + map.size() * 10);
s.append(GenericFormatter.SHORT_DAY_FORMATTER.format());
String h;
for ( final Map.Entry<String, Integer> entry : map.entrySet() ) {
@ -265,6 +241,31 @@ public class WebStructureGraph
return s.toString();
}
/**
 * Tests whether a host with the given host hash is recorded in the web structure.
 * The old structure map is checked first, then the new one; each map is locked
 * only for the duration of its own lookup.
 *
 * @param hosthash the host hash; asserted to be exactly 6 characters long
 * @return true if one of the two structure maps has a key that starts with the host hash
 */
public boolean exists(final String hosthash) {
    assert hosthash.length() == 6;
    return hasKeyWithPrefix(this.structure_old, hosthash)
        || hasKeyWithPrefix(this.structure_new, hosthash);
}

/**
 * Checks whether the first key at or after the prefix in the sorted map starts with that prefix.
 * The lookup runs while holding the monitor of the map itself.
 *
 * @param map the sorted structure map to search
 * @param prefix the key prefix to look for
 * @return true if the smallest key that is greater than or equal to the prefix starts with it
 */
private static boolean hasKeyWithPrefix(final SortedMap<String, byte[]> map, final String prefix) {
    synchronized ( map ) {
        // tailMap starts at the first key >= prefix; a match, if any, must be that first key
        final SortedMap<String, byte[]> tailMap = map.tailMap(prefix);
        return !tailMap.isEmpty() && tailMap.firstKey().startsWith(prefix);
    }
}
public StructureEntry outgoingReferences(final String hosthash) {
// returns a map with a hosthash(String):refcount(Integer) relation
assert hosthash.length() == 6;
@ -279,7 +280,7 @@ public class WebStructureGraph
final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) {
hostname = key.substring(7);
ref = UTF8.String(tailMap.get(key));
ref = ASCII.String(tailMap.get(key));
date = ref.substring(0, 8);
h = refstr2map(ref);
}
@ -290,7 +291,7 @@ public class WebStructureGraph
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) {
ref = UTF8.String(tailMap.get(key));
ref = ASCII.String(tailMap.get(key));
if ( hostname.isEmpty() ) {
hostname = key.substring(7);
}
@ -371,7 +372,7 @@ public class WebStructureGraph
private final Row.Entry entry;
public HostReference(final byte[] hostHash, final long modified, final int count) {
private HostReference(final byte[] hostHash, final long modified, final int count) {
assert (hostHash.length == 6) : "hostHash = " + ASCII.String(hostHash);
this.entry = hostReferenceFactory.getRow().newEntry();
this.entry.setCol(0, hostHash);
@ -383,7 +384,7 @@ public class WebStructureGraph
this.entry = hostReferenceFactory.getRow().newEntry(json, true);
}
public HostReference(final Row.Entry entry) {
// Wraps the given row entry; the entry is stored by reference, not copied.
private HostReference(final Row.Entry entry) {
this.entry = entry;
}
@ -402,7 +403,7 @@ public class WebStructureGraph
return this.entry.getPrimaryKeyBytes();
}
public int count() {
/**
 * Reads column 2 of the underlying row entry and narrows it to an int.
 * NOTE(review): presumably this is the count passed to the
 * (hostHash, modified, count) constructor - confirm against the row definition.
 *
 * @return the long value of column 2, cast to int
 */
private int count() {
    final long stored = this.entry.getColLong(2);
    return (int) stored;
}
@ -436,9 +437,9 @@ public class WebStructureGraph
}
public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory();
public static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null;
public static long hostReferenceIndexCacheTime = 0;
public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache
private static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null;
private static long hostReferenceIndexCacheTime = 0;
private static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache
public synchronized ReferenceContainerCache<HostReference> incomingReferences() {
// we return a cache if the cache is filled and not stale
@ -508,22 +509,6 @@ public class WebStructureGraph
}
}
/*
private void incomingReferencesTest(ReferenceContainerCache<HostReference> idx) {
for (ReferenceContainer<HostReference> references: idx) {
log.logInfo("Term-Host: " + hostHash2hostName(UTF8.String(references.getTermHash())));
Iterator<HostReference> referenceIterator = references.entries();
StringBuilder s = new StringBuilder();
HostReference reference;
while (referenceIterator.hasNext()) {
reference = referenceIterator.next();
s.append(reference.toPropertyForm());
log.logInfo(" ... referenced by " + hostHash2hostName(UTF8.String(reference.metadataHash())) + ", " + reference.count() + " references");
}
}
}
*/
public int referencesCount(final String hosthash) {
// returns the number of hosts that are referenced by this hosthash
assert hosthash.length() == 6 : "hosthash = " + hosthash;
@ -578,21 +563,31 @@ public class WebStructureGraph
return null;
}
private void learn(final DigestURI url, final StringBuilder reference /*string of b64(12digits)-hashes*/) {
final String hosthash = ASCII.String(url.hash(), 6, 6);
private void learnrefs(final LearnObject lro) {
final Set<String> refhosts = new HashSet<String>();
DigestURI du;
String hosthash;
for ( final MultiProtocolURI u : lro.globalRefURLs ) {
if (Switchboard.getSwitchboard().shallTerminate()) break;
du = DigestURI.toDigestURI(u);
hosthash = ASCII.String(du.hash(), 6, 12);
if (!exists(hosthash)) {
// this must be recorded as a host with no references
synchronized ( this.structure_new ) {
this.structure_new.put(hosthash + "," + u.getHost(), UTF8.getBytes(none2refstr()));
}
}
refhosts.add(hosthash);
}
final DigestURI url = lro.url;
hosthash = ASCII.String(url.hash(), 6, 6);
// parse the new reference string and join it with the stored references
final StructureEntry structure = outgoingReferences(hosthash);
final Map<String, Integer> refs =
(structure == null) ? new HashMap<String, Integer>() : structure.references;
assert reference.length() % 12 == 0 : "reference.length() = "
+ reference.length()
+ ", reference = "
+ reference.toString();
String dom;
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
int c;
for ( int i = 0; i < reference.length() / 12; i++ ) {
dom = reference.substring(i * 12 + 6, (i + 1) * 12);
for (String dom: refhosts) {
c = 0;
if ( refs.containsKey(dom) ) {
c = (refs.get(dom)).intValue();
@ -681,14 +676,27 @@ public class WebStructureGraph
}
return maxhost;
}
/**
 * Builds a score map over all hosts of the old and the new structure map.
 * The score key is the part of the structure key after the first 7 characters
 * (the host name that follows the 6-character hash and the comma); the score is
 * (value length - 8) / 10, i.e. the number of 10-character reference entries
 * that follow the 8-character date. Entries of the new map are set after those
 * of the old map, so they overwrite scores for the same host name.
 *
 * @return a score map from host name to number of referenced hosts
 */
public ReversibleScoreMap<String> hostReferenceScore() {
    final ReversibleScoreMap<String> scores = new ClusteredScoreMap<String>(ASCII.identityASCIIComparator);
    scoreHosts(this.structure_old, scores);
    scoreHosts(this.structure_new, scores);
    return scores;
}

/**
 * Sets one score per entry of the given structure map while holding the map's monitor.
 *
 * @param structure the structure map to read
 * @param scores the score map that receives host name and reference count
 */
private static void scoreHosts(final Map<String, byte[]> structure, final ReversibleScoreMap<String> scores) {
    synchronized ( structure ) {
        for ( final Map.Entry<String, byte[]> e : structure.entrySet() ) {
            final String hostname = e.getKey().substring(7);
            final int refcount = (e.getValue().length - 8) / 10;
            scores.set(hostname, refcount);
        }
    }
}
public Iterator<StructureEntry> structureEntryIterator(final boolean latest) {
return new StructureIterator(latest);
}
private class StructureIterator extends LookAheadIterator<StructureEntry> implements
Iterator<StructureEntry>
{
private class StructureIterator extends LookAheadIterator<StructureEntry> implements Iterator<StructureEntry> {
private final Iterator<Map.Entry<String, byte[]>> i;
@ -727,23 +735,43 @@ public class WebStructureGraph
}
}
public static class StructureEntry
{
public static class StructureEntry implements Comparable<StructureEntry> {
public String hosthash; // the tail of the host hash
public String hostname; // the host name
public String date; // date of latest change
public Map<String, Integer> references; // a map from the referenced host hash to the number of referenced to that host
private StructureEntry(final String hosthash, final String hostname) {
this(hosthash, hostname, GenericFormatter.SHORT_DAY_FORMATTER.format(), new HashMap<String, Integer>());
}
private StructureEntry(
final String hosthash,
final String hostname,
final String date,
final Map<String, Integer> references) {
final String hosthash,
final String hostname,
final String date,
final Map<String, Integer> references) {
this.hosthash = hosthash;
this.hostname = hostname;
this.date = date;
this.references = references;
}
/**
 * Orders structure entries by the natural string order of their host hash.
 *
 * @param arg0 the entry to compare with
 * @return the result of comparing this host hash with the other entry's host hash
 */
@Override
public int compareTo(StructureEntry arg0) {
    final String otherHash = arg0.hosthash;
    return this.hosthash.compareTo(otherHash);
}
/**
 * Two structure entries are equal when their host hashes are equal;
 * host name, date and references are not compared.
 *
 * @param o the object to compare with
 * @return true if {@code o} is a StructureEntry with an equal host hash
 */
@Override
public boolean equals(Object o) {
    if (o instanceof StructureEntry) {
        final StructureEntry other = (StructureEntry) o;
        return this.hosthash.equals(other.hosthash);
    }
    return false;
}
/**
 * Hash code derived from the host hash only, matching equals().
 *
 * @return the hash code of the host hash string
 */
@Override
public int hashCode() {
    final String key = this.hosthash;
    return key.hashCode();
}
}
public synchronized void close() {

@ -2552,7 +2552,7 @@ public final class Switchboard extends serverSwitch
// CREATE INDEX
final String dc_title = document.dc_title();
final DigestURI url = new DigestURI(document.dc_source());
final DigestURI url = DigestURI.toDigestURI(document.dc_source());
final DigestURI referrerURL = queueEntry.referrerURL();
EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
@ -2620,7 +2620,7 @@ public final class Switchboard extends serverSwitch
rssRow.put("title", UTF8.getBytes(rssEntry.getValue()));
rssRow.put("recording_date", new Date());
try {
this.tables.update("rss", new DigestURI(rssEntry.getKey()).hash(), rssRow);
this.tables.update("rss", DigestURI.toDigestURI(rssEntry.getKey()).hash(), rssRow);
} catch ( final IOException e ) {
Log.logException(e);
}
@ -3180,7 +3180,7 @@ public final class Switchboard extends serverSwitch
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false);
while (i.hasNext()) {
url = new DigestURI(i.next());
url = DigestURI.toDigestURI(i.next());
boolean islocal = url.getHost().contentEquals(startUrl.getHost());
// add all external links or links to different page to crawler
if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) {

@ -266,7 +266,7 @@ public class Segment {
final long urldate = urlModified.getTime();
for (Map.Entry<MultiProtocolURI, Properties> anchorEntry: anchors.entrySet()) {
MultiProtocolURI anchor = anchorEntry.getKey();
byte[] refhash = new DigestURI(anchor).hash();
byte[] refhash = DigestURI.toDigestURI(anchor).hash();
//System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString());
if (this.urlCitationIndex != null) try {
this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate));

@ -213,7 +213,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(md.url());
final DigestURI digestURI = DigestURI.toDigestURI(md.url());
boolean allAttr = this.isEmpty();
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, "");
@ -341,7 +341,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) {
// we use the SolrCell design as index scheme
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
final DigestURI digestURI = DigestURI.toDigestURI(yacydoc.dc_source());
boolean allAttr = this.isEmpty();
add(doc, YaCySchema.id, id);
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)

@ -177,7 +177,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
entry = i.next();
url = new DigestURI(entry.getKey());
url = DigestURI.toDigestURI(entry.getKey());
desc = entry.getValue();
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
final int ranking = removeAppearanceHashes(url.toNormalform(true), queryhashes).size() +
@ -202,7 +202,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
ientry = i.next();
url = new DigestURI(ientry.url());
url = DigestURI.toDigestURI(ientry.url());
final String u = url.toString();
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;

@ -308,7 +308,7 @@ public final class HTTPDProxyHandler {
DigestURI url = null;
try {
url = new DigestURI(HeaderFramework.getRequestURL(conProp));
url = DigestURI.toDigestURI(HeaderFramework.getRequestURL(conProp));
if (log.isFine()) log.logFine(reqID +" GET "+ url);
if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader);
@ -391,7 +391,7 @@ public final class HTTPDProxyHandler {
final Request request = new Request(
null,
url,
requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(),
requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(),
"",
cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
@ -527,7 +527,7 @@ public final class HTTPDProxyHandler {
final Request request = new Request(
null,
url,
requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(),
requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(),
"",
responseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),

Loading…
Cancel
Save