diff --git a/htroot/CrawlStartScanner_p.java b/htroot/CrawlStartScanner_p.java index b70d8c7af..295ded2e4 100644 --- a/htroot/CrawlStartScanner_p.java +++ b/htroot/CrawlStartScanner_p.java @@ -181,7 +181,7 @@ public class CrawlStartScanner_p final Scanner.Service u = se.next().getKey(); DigestURI uu; try { - uu = new DigestURI(u.url()); + uu = DigestURI.toDigestURI(u.url()); pkmap.put(uu.hash(), uu); } catch ( final MalformedURLException e ) { Log.logException(e); @@ -236,7 +236,7 @@ public class CrawlStartScanner_p while ( se.hasNext() ) { host = se.next(); try { - u = new DigestURI(host.getKey().url()); + u = DigestURI.toDigestURI(host.getKey().url()); urlString = u.toNormalform(true); if ( host.getValue() == Access.granted && Scanner.inIndex(apiCommentCache, urlString) == null ) { diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 88d38eaf3..eb62823d5 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -296,7 +296,7 @@ public class Crawler_p { scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); // get links and generate filter for (MultiProtocolURI u: scraper.getAnchors().keySet()) { - newRootURLs.add(new DigestURI(u)); + newRootURLs.add(DigestURI.toDigestURI(u)); } } catch (IOException e) { Log.logException(e); diff --git a/htroot/ServerScannerList.java b/htroot/ServerScannerList.java index 49c096dd3..4bd7b531c 100644 --- a/htroot/ServerScannerList.java +++ b/htroot/ServerScannerList.java @@ -63,7 +63,7 @@ public class ServerScannerList { while (se.hasNext()) { host = se.next(); try { - u = new DigestURI(host.getKey().url()); + u = DigestURI.toDigestURI(host.getKey().url()); urlString = u.toNormalform(true); prop.put("servertable_list_" + i + "_edit", edit ? 
1 : 0); prop.put("servertable_list_" + i + "_edit_pk", ASCII.String(u.hash())); diff --git a/htroot/WatchWebStructure_p.html b/htroot/WatchWebStructure_p.html index 8683ddf19..9dc529666 100644 --- a/htroot/WatchWebStructure_p.html +++ b/htroot/WatchWebStructure_p.html @@ -68,6 +68,17 @@ To see a list of all APIs, please visit the +
#[host]#
+
#[count]# outlinks
+ + #{/list}# + + #(/hosts)# +
diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java index 8875d368b..be8ce12ca 100644 --- a/htroot/WatchWebStructure_p.java +++ b/htroot/WatchWebStructure_p.java @@ -4,7 +4,10 @@ //$LastChangedBy$ // +import java.util.Iterator; + import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.data.CrawlProfile; import net.yacy.kelondro.data.meta.DigestURI; @@ -27,7 +30,7 @@ public class WatchWebStructure_p { int width = 1024; int height = 576; int depth = 3; - int nodes = 500; // maximum number of host nodes that are painted + int nodes = 300; // maximum number of host nodes that are painted int time = -1; String host = "auto"; String besthost; @@ -36,7 +39,7 @@ public class WatchWebStructure_p { width = post.getInt("width", 1024); height = post.getInt("height", 576); depth = post.getInt("depth", 3); - nodes = post.getInt("nodes", width * height * 100 / 1024 / 576); + nodes = post.getInt("nodes", width * height * 300 / 1024 / 576); time = post.getInt("time", -1); host = post.get("host", "auto"); color_text = post.get("colortext", color_text); @@ -70,6 +73,22 @@ public class WatchWebStructure_p { host = "www." 
+ host; } } + + if (post != null && post.containsKey("hosts")) { + int maxcount = 200; + ReversibleScoreMap score = sb.webStructure.hostReferenceScore(); + int c = 0; + Iterator i = score.keys(false); + String h; + while (i.hasNext() && c < maxcount) { + h = i.next(); + prop.put("hosts_list_" + c + "_host", h); + prop.put("hosts_list_" + c + "_count", score.get(h)); + c++; + } + prop.put("hosts_list", c); + prop.put("hosts", 1); + } // find start point if (host == null || diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index 2caee7b66..9b777de04 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -28,7 +28,6 @@ import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; @@ -61,7 +60,7 @@ public class WebStructurePicture_p { int width = 1024; int height = 576; int depth = 3; - int nodes = 100; // maximum number of host nodes that are painted + int nodes = 300; // maximum number of host nodes that are painted int time = -1; String host = null; int cyc = 0; @@ -156,16 +155,13 @@ public class WebStructurePicture_p { final double radius = 1.0 / (1 << nextlayer); final WebStructureGraph.StructureEntry sr = structure.outgoingReferences(centerhash); final Map next = (sr == null) ? 
new HashMap() : sr.references; - Map.Entry entry; String targethash, targethost; // first set points to next hosts - final Iterator> i = next.entrySet().iterator(); final List targets = new ArrayList(); int maxtargetrefs = 8, maxthisrefs = 8; int targetrefs, thisrefs; double rr, re; - while (i.hasNext() && maxnodes > 0 && System.currentTimeMillis() < timeout) { - entry = i.next(); + for (Map.Entry entry: next.entrySet()) { targethash = entry.getKey(); targethost = structure.hostHash2hostName(targethash); if (targethost == null) continue; @@ -181,15 +177,12 @@ public class WebStructurePicture_p { rr = radius * 0.25 * (1 - targetrefs / (double) maxtargetrefs); re = radius * 0.5 * (thisrefs / (double) maxthisrefs); graph.addNode(targethost, x + (radius - rr - re) * Math.cos(angle), y + (radius - rr - re) * Math.sin(angle), nextlayer); - maxnodes--; mynodes++; + if (maxnodes-- <= 0 || System.currentTimeMillis() >= timeout) break; } // recursively set next hosts - final Iterator j = targets.iterator(); - String[] target; int nextnodes; - while (j.hasNext()) { - target = j.next(); + for (String[] target: targets) { targethash = target[0]; targethost = target[1]; final GraphPlotter.Point c = graph.getNode(targethost); diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index 313184ba1..76fa0cb2c 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -79,14 +79,14 @@ public class webstructure { prop.put("out", 1); prop.put("in", 1); WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hosthash); - if (sentry != null) { + if (sentry != null && sentry.references.size() > 0) { reference(prop, "out", 0, sentry, sb.webStructure); prop.put("out_domains", 1); } else { prop.put("out_domains", 0); } sentry = sb.webStructure.incomingReferences(hosthash); - if (sentry != null) { + if (sentry != null && sentry.references.size() > 0) { reference(prop, "in", 0, sentry, sb.webStructure); prop.put("in_domains", 1); } 
else { @@ -113,7 +113,7 @@ public class webstructure { int d = 0; Iterator i = scraper.inboundLinks().iterator(); while (i.hasNext()) { - DigestURI refurl = new DigestURI(i.next()); + DigestURI refurl = DigestURI.toDigestURI(i.next()); byte[] refhash = refurl.hash(); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.put("references_documents_0_anchors_" + d + "_hash", refhash); @@ -122,7 +122,7 @@ public class webstructure { } i = scraper.outboundLinks().iterator(); while (i.hasNext()) { - DigestURI refurl = new DigestURI(i.next()); + DigestURI refurl = DigestURI.toDigestURI(i.next()); byte[] refhash = refurl.hash(); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.put("references_documents_0_anchors_" + d + "_hash", refhash); diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 2b0c392a2..ffd388ef4 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -204,7 +204,7 @@ public final class CrawlStacker { if (e.getKey() == null) continue; // delete old entry, if exists to force a re-load of the url (thats wanted here) - final DigestURI url = new DigestURI(e.getKey()); + final DigestURI url = DigestURI.toDigestURI(e.getKey()); final byte[] urlhash = url.hash(); if (replace) { this.indexSegment.fulltext().remove(urlhash); diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index f24dc0146..dfa668c95 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -150,7 +150,7 @@ public final class HTTPLoader { } // normalize URL - final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); + final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); // 
restart crawling with new url this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString); @@ -289,7 +289,7 @@ public final class HTTPLoader { } // normalizing URL - final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); + final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); // if we are already doing a shutdown we don't need to retry crawling diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index 76abc6fe8..ac22158de 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -155,7 +155,7 @@ public class BookmarkHelper { if ("".equals(title)) {//cannot be displayed title = url.toString(); } - bm = db.new Bookmark(new DigestURI(url)); + bm = db.new Bookmark(DigestURI.toDigestURI(url)); bm.setProperty(Bookmark.BOOKMARK_TITLE, title); bm.setTags(tags); bm.setPublic(importPublic); diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index 3f53af4c6..b4812211a 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -168,7 +168,7 @@ public class sevenzipParser extends AbstractParser implements Parser { Document[] theDocs; // workaround for relative links in file, normally '#' shall be used behind the location, see // below for reversion of the effects - final DigestURI url = new DigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath)); + final DigestURI url = DigestURI.toDigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath)); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); theDocs = TextParser.parseSource(url, mime, null, 
this.cfos.toByteArray()); diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index 9edf3986d..3e098d5c5 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -90,7 +90,7 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(new DigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp); + subDocs = TextParser.parseSource(DigestURI.toDigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp); if (subDocs == null) continue; for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index 8fca97350..a124bd946 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -87,7 +87,7 @@ public class zipParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(zis, tmp, entry.getSize()); - final DigestURI virtualURL = new DigestURI(MultiProtocolURI.newURL(url, "#" + name)); + final DigestURI virtualURL = DigestURI.toDigestURI(MultiProtocolURI.newURL(url, "#" + name)); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); docs = TextParser.parseSource(virtualURL, mime, null, tmp); if (docs == null) continue; diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index 90ca65da3..fb65695f6 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -133,13 +133,18 @@ public class DigestURI extends MultiProtocolURI implements Serializable { /** * DigestURI from general URI - * @param 
baseURL + * @param u the URI to copy; if it is already a DigestURI, its hash is reused */ - public DigestURI(final MultiProtocolURI baseURL) { - super(baseURL); - this.hash = (baseURL instanceof DigestURI) ? ((DigestURI) baseURL).hash : null; + private DigestURI(final MultiProtocolURI u) { + super(u); + this.hash = (u instanceof DigestURI) ? ((DigestURI) u).hash : null; } + + public static DigestURI toDigestURI(MultiProtocolURI u) { + return (u instanceof DigestURI) ? ((DigestURI) u) : new DigestURI(u); + } + /** * DigestURI from general URI, hash already calculated * @param baseURL diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index 896a21462..a3e8305ae 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -49,6 +49,8 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.Base64Order; +import net.yacy.cora.sorting.ClusteredScoreMap; +import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.util.LookAheadIterator; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.Document; @@ -64,8 +66,7 @@ import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; -public class WebStructureGraph -{ +public class WebStructureGraph { public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. 
wikipedia) public static int maxhosts = 50000; // maximum number of hosts in web structure map @@ -75,17 +76,16 @@ public class WebStructureGraph private final File structureFile; private final TreeMap structure_old; // ',' to {}* private final TreeMap structure_new; - private final BlockingQueue publicRefDNSResolvingQueue; + private final BlockingQueue publicRefDNSResolvingQueue; private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker; - private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null); + private final static LearnObject leanrefObjectPOISON = new LearnObject(null, null); - private static class leanrefObject - { + private static class LearnObject { private final DigestURI url; private final Set globalRefURLs; - private leanrefObject(final DigestURI url, final Set globalRefURLs) { + private LearnObject(final DigestURI url, final Set globalRefURLs) { this.url = url; this.globalRefURLs = globalRefURLs; } @@ -95,7 +95,7 @@ public class WebStructureGraph this.structure_old = new TreeMap(); this.structure_new = new TreeMap(); this.structureFile = structureFile; - this.publicRefDNSResolvingQueue = new LinkedBlockingQueue(); + this.publicRefDNSResolvingQueue = new LinkedBlockingQueue(); // load web structure Map loadedStructureB; @@ -142,7 +142,7 @@ public class WebStructureGraph @Override public void run() { - leanrefObject lro; + LearnObject lro; try { while ( (lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON ) { learnrefs(lro); @@ -170,7 +170,7 @@ public class WebStructureGraph globalRefURLs.add(u); } } - final leanrefObject lro = new leanrefObject(url, globalRefURLs); + final LearnObject lro = new LearnObject(url, globalRefURLs); if ( !globalRefURLs.isEmpty() ) { try { if ( this.publicRefDNSResolvingWorker.isAlive() ) { @@ -184,34 +184,6 @@ public class WebStructureGraph } } - private void learnrefs(final leanrefObject lro) { - final StringBuilder cpg = new StringBuilder(240); - 
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString(); - //final String refhashp = ASCII.String(lro.url.hash(), 6, 6); // ref hash part - String nexturlhash; - for ( final MultiProtocolURI u : lro.globalRefURLs ) { - if (Switchboard.getSwitchboard().shallTerminate()) break; - final byte[] nexturlhashb = new DigestURI(u).hash(); - assert nexturlhashb != null; - if ( nexturlhashb != null ) { - nexturlhash = ASCII.String(nexturlhashb); - assert nexturlhash.length() == 12 : "nexturlhash.length() = " - + nexturlhash.length() - + ", nexturlhash = " - + nexturlhash; - //assert !nexturlhash.substring(6).equals(refhashp); - // this is a global link - cpg.append(nexturlhash); // store complete hash - assert cpg.length() % 12 == 0 : "cpg.length() = " - + cpg.length() - + ", cpg = " - + cpg.toString(); - } - } - assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString(); - learn(lro.url, cpg); - } - private static int refstr2count(final String refs) { if ( (refs == null) || (refs.length() <= 8) ) { return 0; @@ -220,7 +192,7 @@ public class WebStructureGraph return (refs.length() - 8) / 10; } - static Map refstr2map(final String refs) { + private static Map refstr2map(final String refs) { if ( (refs == null) || (refs.length() <= 8) ) { return new HashMap(); } @@ -240,8 +212,12 @@ public class WebStructureGraph return map; } + private static String none2refstr() { + return GenericFormatter.SHORT_DAY_FORMATTER.format(); + } + private static String map2refstr(final Map map) { - final StringBuilder s = new StringBuilder(map.size() * 10); + final StringBuilder s = new StringBuilder(GenericFormatter.PATTERN_SHORT_DAY.length() + map.size() * 10); s.append(GenericFormatter.SHORT_DAY_FORMATTER.format()); String h; for ( final Map.Entry entry : map.entrySet() ) { @@ -265,6 +241,31 @@ public class WebStructureGraph return s.toString(); } + public boolean exists(final String hosthash) { + // returns true if a key starting with the
given hosthash exists in structure_old or structure_new + assert hosthash.length() == 6; + SortedMap tailMap; + synchronized ( this.structure_old ) { + tailMap = this.structure_old.tailMap(hosthash); + if ( !tailMap.isEmpty() ) { + final String key = tailMap.firstKey(); + if ( key.startsWith(hosthash) ) { + return true; + } + } + } + synchronized ( this.structure_new ) { + tailMap = this.structure_new.tailMap(hosthash); + if ( !tailMap.isEmpty() ) { + final String key = tailMap.firstKey(); + if ( key.startsWith(hosthash) ) { + return true; + } + } + } + return false; + } + public StructureEntry outgoingReferences(final String hosthash) { // returns a map with a hosthash(String):refcount(Integer) relation assert hosthash.length() == 6; @@ -279,7 +280,7 @@ public class WebStructureGraph final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { hostname = key.substring(7); - ref = UTF8.String(tailMap.get(key)); + ref = ASCII.String(tailMap.get(key)); date = ref.substring(0, 8); h = refstr2map(ref); } @@ -290,7 +291,7 @@ public class WebStructureGraph if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { - ref = UTF8.String(tailMap.get(key)); + ref = ASCII.String(tailMap.get(key)); if ( hostname.isEmpty() ) { hostname = key.substring(7); } @@ -371,7 +372,7 @@ public class WebStructureGraph private final Row.Entry entry; - public HostReference(final byte[] hostHash, final long modified, final int count) { + private HostReference(final byte[] hostHash, final long modified, final int count) { assert (hostHash.length == 6) : "hostHash = " + ASCII.String(hostHash); this.entry = hostReferenceFactory.getRow().newEntry(); this.entry.setCol(0, hostHash); @@ -383,7 +384,7 @@ public class WebStructureGraph this.entry = hostReferenceFactory.getRow().newEntry(json, true); } - public HostReference(final Row.Entry entry) { + private HostReference(final Row.Entry entry) { this.entry = entry; } @@ -402,7 +403,7 @@ public class 
WebStructureGraph return this.entry.getPrimaryKeyBytes(); } - public int count() { + private int count() { return (int) this.entry.getColLong(2); } @@ -436,9 +437,9 @@ public class WebStructureGraph } public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory(); - public static ReferenceContainerCache hostReferenceIndexCache = null; - public static long hostReferenceIndexCacheTime = 0; - public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache + private static ReferenceContainerCache hostReferenceIndexCache = null; + private static long hostReferenceIndexCacheTime = 0; + private static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache public synchronized ReferenceContainerCache incomingReferences() { // we return a cache if the cache is filled and not stale @@ -508,22 +509,6 @@ public class WebStructureGraph } } - /* - private void incomingReferencesTest(ReferenceContainerCache idx) { - for (ReferenceContainer references: idx) { - log.logInfo("Term-Host: " + hostHash2hostName(UTF8.String(references.getTermHash()))); - Iterator referenceIterator = references.entries(); - StringBuilder s = new StringBuilder(); - HostReference reference; - while (referenceIterator.hasNext()) { - reference = referenceIterator.next(); - s.append(reference.toPropertyForm()); - log.logInfo(" ... 
referenced by " + hostHash2hostName(UTF8.String(reference.metadataHash())) + ", " + reference.count() + " references"); - } - } - } - */ - public int referencesCount(final String hosthash) { // returns the number of hosts that are referenced by this hosthash assert hosthash.length() == 6 : "hosthash = " + hosthash; @@ -578,21 +563,31 @@ public class WebStructureGraph return null; } - private void learn(final DigestURI url, final StringBuilder reference /*string of b64(12digits)-hashes*/) { - final String hosthash = ASCII.String(url.hash(), 6, 6); + + private void learnrefs(final LearnObject lro) { + final Set refhosts = new HashSet(); + DigestURI du; + String hosthash; + for ( final MultiProtocolURI u : lro.globalRefURLs ) { + if (Switchboard.getSwitchboard().shallTerminate()) break; + du = DigestURI.toDigestURI(u); + hosthash = ASCII.String(du.hash(), 6, 12); + if (!exists(hosthash)) { + // the referenced host is not yet known: record it as a host with no references + synchronized ( this.structure_new ) { + this.structure_new.put(hosthash + "," + u.getHost(), UTF8.getBytes(none2refstr())); + } + } + refhosts.add(hosthash); + } + final DigestURI url = lro.url; + hosthash = ASCII.String(url.hash(), 6, 6); // parse the new reference string and join it with the stored references final StructureEntry structure = outgoingReferences(hosthash); - final Map refs = - (structure == null) ? new HashMap() : structure.references; - assert reference.length() % 12 == 0 : "reference.length() = " - + reference.length() - + ", reference = " - + reference.toString(); - String dom; + final Map refs = (structure == null) ? 
new HashMap() : structure.references; int c; - for ( int i = 0; i < reference.length() / 12; i++ ) { - dom = reference.substring(i * 12 + 6, (i + 1) * 12); + for (String dom: refhosts) { c = 0; if ( refs.containsKey(dom) ) { c = (refs.get(dom)).intValue(); @@ -681,14 +676,27 @@ public class WebStructureGraph } return maxhost; } - + + public ReversibleScoreMap hostReferenceScore() { + ReversibleScoreMap result = new ClusteredScoreMap(ASCII.identityASCIIComparator); + synchronized ( this.structure_old ) { + for ( final Map.Entry entry : this.structure_old.entrySet() ) { + result.set(entry.getKey().substring(7), (entry.getValue().length - 8) / 10); + } + } + synchronized ( this.structure_new ) { + for ( final Map.Entry entry : this.structure_new.entrySet() ) { + result.set(entry.getKey().substring(7), (entry.getValue().length - 8) / 10); + } + } + return result; + } + public Iterator structureEntryIterator(final boolean latest) { return new StructureIterator(latest); } - private class StructureIterator extends LookAheadIterator implements - Iterator - { + private class StructureIterator extends LookAheadIterator implements Iterator { private final Iterator> i; @@ -727,23 +735,43 @@ public class WebStructureGraph } } - public static class StructureEntry - { + public static class StructureEntry implements Comparable { + public String hosthash; // the tail of the host hash public String hostname; // the host name public String date; // date of latest change public Map references; // a map from the referenced host hash to the number of referenced to that host + private StructureEntry(final String hosthash, final String hostname) { + this(hosthash, hostname, GenericFormatter.SHORT_DAY_FORMATTER.format(), new HashMap()); + } + private StructureEntry( - final String hosthash, - final String hostname, - final String date, - final Map references) { + final String hosthash, + final String hostname, + final String date, + final Map references) { this.hosthash = hosthash; 
this.hostname = hostname; this.date = date; this.references = references; } + + @Override + public int compareTo(StructureEntry arg0) { + return hosthash.compareTo(arg0.hosthash); + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof StructureEntry)) return false; + return hosthash.equals(((StructureEntry)o).hosthash); + } + + @Override + public int hashCode() { + return this.hosthash.hashCode(); + } } public synchronized void close() { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index e76346010..d337e3528 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2552,7 +2552,7 @@ public final class Switchboard extends serverSwitch // CREATE INDEX final String dc_title = document.dc_title(); - final DigestURI url = new DigestURI(document.dc_source()); + final DigestURI url = DigestURI.toDigestURI(document.dc_source()); final DigestURI referrerURL = queueEntry.referrerURL(); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); @@ -2620,7 +2620,7 @@ public final class Switchboard extends serverSwitch rssRow.put("title", UTF8.getBytes(rssEntry.getValue())); rssRow.put("recording_date", new Date()); try { - this.tables.update("rss", new DigestURI(rssEntry.getKey()).hash(), rssRow); + this.tables.update("rss", DigestURI.toDigestURI(rssEntry.getKey()).hash(), rssRow); } catch ( final IOException e ) { Log.logException(e); } @@ -3180,7 +3180,7 @@ public final class Switchboard extends serverSwitch final Iterator i = links.keySet().iterator(); final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false); while (i.hasNext()) { - url = new DigestURI(i.next()); + url = DigestURI.toDigestURI(i.next()); boolean islocal = url.getHost().contentEquals(startUrl.getHost()); // add all external links or links to different page to crawler if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) { diff 
--git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index f6b7c27ed..8f67db66d 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -266,7 +266,7 @@ public class Segment { final long urldate = urlModified.getTime(); for (Map.Entry anchorEntry: anchors.entrySet()) { MultiProtocolURI anchor = anchorEntry.getKey(); - byte[] refhash = new DigestURI(anchor).hash(); + byte[] refhash = DigestURI.toDigestURI(anchor).hash(); //System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString()); if (this.urlCitationIndex != null) try { this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate)); diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index be40bf440..2b236e90f 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -213,7 +213,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } final SolrInputDocument doc = new SolrInputDocument(); - final DigestURI digestURI = new DigestURI(md.url()); + final DigestURI digestURI = DigestURI.toDigestURI(md.url()); boolean allAttr = this.isEmpty(); if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); @@ -341,7 +341,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) { // we use the SolrCell design as index scheme final SolrInputDocument doc = new SolrInputDocument(); - final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); + final DigestURI digestURI = DigestURI.toDigestURI(yacydoc.dc_source()); boolean 
allAttr = this.isEmpty(); add(doc, YaCySchema.id, id); if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index 395375736..cd424551c 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -177,7 +177,7 @@ public class MediaSnippet implements Comparable, Comparator result = new ArrayList(); while (i.hasNext()) { entry = i.next(); - url = new DigestURI(entry.getKey()); + url = DigestURI.toDigestURI(entry.getKey()); desc = entry.getValue(); if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue; final int ranking = removeAppearanceHashes(url.toNormalform(true), queryhashes).size() + @@ -202,7 +202,7 @@ public class MediaSnippet implements Comparable, Comparator result = new ArrayList(); while (i.hasNext()) { ientry = i.next(); - url = new DigestURI(ientry.url()); + url = DigestURI.toDigestURI(ientry.url()); final String u = url.toString(); if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue; if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue; diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java index abf7467a8..95bea1ec0 100644 --- a/source/net/yacy/server/http/HTTPDProxyHandler.java +++ b/source/net/yacy/server/http/HTTPDProxyHandler.java @@ -308,7 +308,7 @@ public final class HTTPDProxyHandler { DigestURI url = null; try { - url = new DigestURI(HeaderFramework.getRequestURL(conProp)); + url = DigestURI.toDigestURI(HeaderFramework.getRequestURL(conProp)); if (log.isFine()) log.logFine(reqID +" GET "+ url); if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); @@ -391,7 +391,7 @@ public final class HTTPDProxyHandler { final Request request = new Request( null, url, - 
requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(), + requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(), "", cachedResponseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(), @@ -527,7 +527,7 @@ public final class HTTPDProxyHandler { final Request request = new Request( null, url, - requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(), + requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(), "", responseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(),