diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index 3b06b914c..ed9e7844a 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -178,6 +178,26 @@ #{/list}# +#(linkstructure)#:: + +
+ +
+#(/linkstructure)#

Crawled Pages

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index eb62823d5..dd607bfa8 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -171,7 +171,7 @@ public class Crawler_p { try { DigestURI crawlingStartURL = new DigestURI(crawlingStart); rootURLs.add(crawlingStartURL); - crawlName += crawlingStartURL.getHost() + "_"; + crawlName += crawlingStartURL.getHost() + ','; if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false; } catch (MalformedURLException e) { @@ -180,8 +180,11 @@ public class Crawler_p { } else { crawlName = crawlingFile.getName(); } - if (crawlName.length() > 80) crawlName = crawlName.substring(0, 80); - if (crawlName.endsWith("_")) crawlName = crawlName.substring(0, crawlName.length() - 1); + if (crawlName.length() > 256) { + int p = crawlName.lastIndexOf(','); + if (p >= 8) crawlName = crawlName.substring(0, p); + } + if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1); // set the crawl filter @@ -515,16 +518,41 @@ public class Crawler_p { final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160); CrawlProfile profile; // put active crawls into list + String hosts = ""; for (final byte[] h: sb.crawler.getActive()) { profile = sb.crawler.getActive(h); if (CrawlProfile.ignoreNames.contains(profile.name())) continue; profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength); + if (profile.urlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN) { + hosts = hosts + "," + profile.name(); + } dark = !dark; count++; } prop.put("crawlProfilesShow_list", count); prop.put("crawlProfilesShow", count == 0 ? 0 : 1); + if (count > 0) { + // collect the host names for 'wide' crawls which can be visualized + boolean showLinkstructure = hosts.length() > 0; + /* + // check if there is actually something to see + if (showLinkstructure) { + showLinkstructure = false; + for (String host: hosts.substring(1).split(",")) { + String hash = null; + try {hash = ASCII.String((new DigestURI("http://" + host)).hash(), 6, 6);} catch (final MalformedURLException e) {Log.logException(e);} + if (hash != null && sb.webStructure.referencesCount(hash) > 0) {showLinkstructure = true; break;} + } + } + */ + if (showLinkstructure) { + prop.put("crawlProfilesShow_linkstructure", 1); + prop.put("crawlProfilesShow_linkstructure_hosts", hosts.substring(1)); + } else { + prop.put("crawlProfilesShow_linkstructure", 0); + } + } // return rewrite properties return prop; diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 10e436a1a..894427fce 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -133,7 +133,7 @@ public class QuickCrawlLink_p { CrawlProfile pe = null; try { pe = new CrawlProfile( - crawlingStartURL.toNormalform(true), + (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost(), crawlingMustMatch, //crawlerUrlMustMatch crawlingMustNotMatch, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index 1d703049d..f48212fe7 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -116,7 +116,7 @@ public class WebStructurePicture_p { String hash = null; try {hash = ASCII.String((new DigestURI("http://" + host)).hash(), 6, 6);} catch (final MalformedURLException e) {Log.logException(e);} Map.Entry centernode = new AbstractMap.SimpleEntry(hash, host); - double angle = 2.0d * i * Math.PI / hostlist.length - Math.PI / hostlist.length; + double angle = 2.0d * i * Math.PI / hostlist.length; if (hostlist.length == 3) angle -= Math.PI / 2; if (hostlist.length == 4) angle += Math.PI / 4; graph.addNode(centernode.getValue(), Math.cos(angle) / 8, Math.sin(angle) / 8, 0); @@ -128,7 +128,8 @@ public class WebStructurePicture_p { // test with: http://localhost:8090/WebStructurePicture_p.png?pa=1&ral=0.7&raa=0.5&rar=2&rel=0.5&rea=1&rer=2 GraphPlotter.Ribbon rAll = new GraphPlotter.Ribbon(post.getFloat("ral", 0.1f), post.getFloat("raa", 0.1f), post.getFloat("rar", 0.1f)); GraphPlotter.Ribbon rEdge = new GraphPlotter.Ribbon(post.getFloat("rel", 0.05f), post.getFloat("rea", 0.1f), post.getFloat("rer", 0.1f)); - for (int i = 0; i < post.getInt("pa", 1); i++) graph = graph.physics(rAll, rEdge); + int pa = post.getInt("pa", 0); + for (int i = 0; i < pa; i++) graph = graph.physics(rAll, rEdge); } // draw the graph diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index dfa668c95..bad4a153b 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -156,6 +156,8 @@ public final class HTTPLoader { this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString); this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl); + this.sb.webStructure.generateCitationReference(url, redirectionUrl); + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode); } diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index d7bea501c..49e0bcb4f 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -171,9 +171,27 @@ public class WebStructureGraph { } } final LearnObject lro = new LearnObject(url, globalRefURLs); + if (!globalRefURLs.isEmpty()) { + try { + if (this.publicRefDNSResolvingWorker.isAlive()) { + this.publicRefDNSResolvingQueue.put(lro); + } else { + learnrefs(lro); + } + } catch ( final InterruptedException e ) { + learnrefs(lro); + } + } + } + + public void generateCitationReference(final DigestURI from, final DigestURI to) { + final HashSet globalRefURLs = new HashSet(); + final String refhost = from.getHost(); + if (refhost != null && to.getHost() != null && !to.getHost().equals(refhost)) globalRefURLs.add(to); + final LearnObject lro = new LearnObject(from, globalRefURLs); if ( !globalRefURLs.isEmpty() ) { try { - if ( this.publicRefDNSResolvingWorker.isAlive() ) { + if (this.publicRefDNSResolvingWorker.isAlive()) { this.publicRefDNSResolvingQueue.put(lro); } else { learnrefs(lro); diff --git a/source/net/yacy/visualization/GraphPlotter.java b/source/net/yacy/visualization/GraphPlotter.java index 592c59494..39ea0c22d 100644 --- a/source/net/yacy/visualization/GraphPlotter.java +++ b/source/net/yacy/visualization/GraphPlotter.java @@ -182,7 +182,7 @@ public class GraphPlotter implements Cloneable { } public boolean hasEdge(final String fromNode, final String toNode) { - return this.edges.contains(fromNode + "-" + toNode); + return this.edges.contains(fromNode + '-' + toNode); } public void setEdge(final String fromNode, final String toNode) { @@ -190,18 +190,18 @@ public class GraphPlotter implements Cloneable { final Point to = this.nodes.get(toNode); assert from != null; assert to != null; - this.edges.add(fromNode + "$" + toNode); + this.edges.add(fromNode + '$' + toNode); } public Collection getEdges(final String node, boolean start) { Collection c = new ArrayList(); if (start) { - String s = node + "$"; + String s = node + '$'; for (String e: this.edges) { if (e.startsWith(s)) c.add(e.substring(s.length())); } } else { - String s = "$" + node; + String s = '$' + node; for (String e: this.edges) { if (e.endsWith(s)) c.add(e.substring(0, e.length() - s.length())); } @@ -219,7 +219,7 @@ public class GraphPlotter implements Cloneable { entry = i.next(); name = entry.getKey(); c = entry.getValue(); - System.out.println("point(" + c.x + ", " + c.y + ", " + c.layer + ") [" + name + "]"); + System.out.println("point(" + c.x + ", " + c.y + ", " + c.layer + ") [" + name + ']'); } final Iterator j = this.edges.iterator(); while (j.hasNext()) {