Michael Peter Christen 6 years ago
commit c347e7d3f8

@ -76,12 +76,12 @@ NO OTHER SOFTWARE IS REQUIRED!
Startup and Shutdown of YaCy:
- on GNU/Linux and OpenBSD:
to start: execute ./startYACY.sh
to stop : execute ./stopYACY.sh
- to start: execute `./startYACY.sh`
- to stop : execute `./stopYACY.sh`
- on Windows:
to start: double-click startYACY.bat
to stop : double-click stopYACY.bat
- to start: double-click `startYACY.bat`
- to stop : double-click `stopYACY.bat`
- on Mac OS X:
please use the Mac Application and start or stop it like any
@ -135,10 +135,14 @@ More details for YaCy on Heroku in [Heroku.md](Heroku.md).
## Port 8090 is bad, people are not allowed to access that port
You can forward port 80 to 8090 with iptables:
```bash
iptables -t nat -A PREROUTING -p tcp --dport 80 -j REDIRECT --to-port 8090
```
On some operating systems, you must first enable access to the ports you are using, for example:
```bash
iptables -I INPUT -m tcp -p tcp --dport 8090 -j ACCEPT
```
## How can I scale this; how much ram is needed; disk space?

@ -20,6 +20,7 @@
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>

@ -317,6 +317,27 @@
Obey html-robots-nofollow: <input type="checkbox" name="obeyHtmlRobotsNofollow" id="obeyHtmlRobotsNofollow" #(obeyHtmlRobotsNofollowChecked)#::checked="checked"#(/obeyHtmlRobotsNofollowChecked)# /><!--<br/>
Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />-->
</dd>
<dt>Media Type detection</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type checking info"/>
<span style="right:0px; width:30em;" id="mediaTypeCheckingInfo">
Not loading URLs with an unsupported file extension is faster but less accurate:
for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
<ul>
<li><a href="https://en.wikipedia.org/wiki/.de" target="_blank">https://en.wikipedia.org/wiki/.de</a> : the .de extension is unknown, but the actual Media Type of this page is text/html</li>
<li><a href="https://en.wikipedia.org/wiki/Ask.com" target="_blank">https://en.wikipedia.org/wiki/Ask.com</a> : the .com extension is not supported (executable file format), but the actual Media Type of this page is text/html</li>
<li><a href="https://commons.wikimedia.org/wiki/File:YaCy_logo.png" target="_blank">https://commons.wikimedia.org/wiki/File:YaCy_logo.png</a> : the .png extension is a supported image format, but the actual Media Type of this page is text/html</li>
</ul>
</span>
</div>
<label>
<input type="radio" aria-describedby="mediaTypeCheckingInfo" name="crawlerAlwaysCheckMediaType" value="false" #(crawlerAlwaysCheckMediaType)#checked="checked"::#(/crawlerAlwaysCheckMediaType)# /> Do not load URLs with an unsupported file extension
</label>
<label>
<input type="radio" name="crawlerAlwaysCheckMediaType" value="true" #(crawlerAlwaysCheckMediaType)#::checked="checked"#(/crawlerAlwaysCheckMediaType)# /> Always cross check file extension against Content-Type header
</label>
</dd>
<dt>Load Filter on URLs</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
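
The help text added above captures the rationale for the new crawlerAlwaysCheckMediaType option: the file extension is only a hint, and the authoritative Media Type comes from the HTTP Content-Type header. A minimal standalone sketch of such a cross-check, using only JDK classes and a hypothetical class name (not YaCy code):

```java
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

public class MediaTypeCrossCheck {

    public static void main(String[] args) throws Exception {
        // One of the examples from the help text above
        URL url = new URL("https://commons.wikimedia.org/wiki/File:YaCy_logo.png");

        // Guess from the file name only: the ".png" extension suggests image/png
        String guessedFromExtension = URLConnection.guessContentTypeFromName(url.getPath());

        // Ask the server what the resource actually is (HEAD request, no body download)
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("HEAD");
        String reportedByServer = connection.getContentType(); // e.g. "text/html; charset=UTF-8"
        connection.disconnect();

        System.out.println("guessed from extension: " + guessedFromExtension);
        System.out.println("reported by the server: " + reportedByServer);
        // For this page the two differ: the extension suggests an image,
        // while the actual Media Type is text/html.
    }
}
```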

@ -213,6 +213,13 @@ public class CrawlStartExpert {
prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
}
// always cross-check URL file extension against actual Media Type ?
if (post == null) {
prop.put("crawlerAlwaysCheckMediaType", true);
} else {
prop.put("crawlerAlwaysCheckMediaType", post.getBoolean("crawlerAlwaysCheckMediaType"));
}
// Load Filter on URLs (range)
if (post != null && post.containsKey("range")) {

@ -290,7 +290,7 @@ window.setInterval("setTableSize()", 1000);
<td>
<form style="float:right;" action="Crawler_p.html"><input type="submit" name="hidewebstructuregraph" class="btn btn-default btn-xs" value="hide graphic"/><form>
</td></tr></table>
<script src="js/d3.v3.min.js"></script>
<script src="js/d3.v5.min.js"></script>
<script src="js/hypertree.js"></script>
<div id="linkstructure"></div>
<script>$(document).ready(linkstructure("#[hosts]#", "#linkstructure", 1280, 720, 3000, 700));</script>::

@ -332,7 +332,7 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents even when no parser is available
env.setConfig("crawlingDirectDocByURL", directDocByURL);
final String collection = post.get("collection", "user");
@ -633,6 +633,8 @@ public class Crawler_p {
.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
post.getBoolean("crawlerAlwaysCheckMediaType"));
handle = ASCII.getBytes(profile.handle());

@ -187,8 +187,8 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
<table class="sortable" style="float:left; border-width: 0">
<thead>
<tr>
<th style="text-align:center; width:32"></th>
<th style="text-align:left; width: 600" class="listing">Path</th>
<th style="text-align:center; width:32px"></th>
<th style="text-align:left; width: 600px" class="listing">Path</th>
<th style="text-align:right; padding:2px;" class="listingem">stored</th>
<th style="text-align:right; padding:2px;" class="listingem">linked</th>
<th style="text-align:right; padding:2px;" class="listingem">pending</th>
@ -196,6 +196,7 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
<th style="text-align:right; padding:2px;" class="listingem">failed</th>
</tr>
</thead>
<tbody>
#(root)#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td style="text-align:center"></td>
@ -226,11 +227,12 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
</tr>
#(/type)#
#{/list}#
</tbody>
</table>
</fieldset>
#(linkgraph)#<div style="text-align:center"><form><input name="showlinkstructure" onClick="location.href = location.toString() + '&showlinkstructure=';" class="btn btn-default btn-xs" value="show link structure graph"/></form></div>::
<script src="js/d3.v3.min.js"></script>
<script src="js/d3.v5.min.js"></script>
<script src="js/hypertree.js"></script>
<div id="linkstructure"></div>
<script>$(document).ready(linkstructure("#[host]#", "#linkstructure", 1280, 720, 3000, 700));</script>

@ -32,7 +32,9 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
@ -417,7 +419,11 @@ public class HostBrowser {
q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM);
}
}
BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, 100, 1, false,
final int pageSize = 100;
final BlockingQueue<SolrDocument> docs = new ArrayBlockingQueue<>(pageSize);
final List<String> queries = new ArrayList<>();
queries.add(q.toString());
final Thread solrQueryTask = new Thread(fulltext.getDefaultConnector().newDocumentsByQueriesTask(docs, queries, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, pageSize, 1,
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.failreason_s.getSolrFieldName(),
@ -433,8 +439,8 @@ public class HostBrowser {
CollectionSchema.references_exthosts_i.getSolrFieldName(),
CollectionSchema.cr_host_chance_d.getSolrFieldName(),
CollectionSchema.cr_host_norm_i.getSolrFieldName()
);
SolrDocument doc;
));
solrQueryTask.start();
Set<String> storedDocs = new HashSet<String>();
Map<String, FailType> errorDocs = new HashMap<String, FailType>();
Set<String> inboundLinks = new HashSet<String>();
@ -445,60 +451,72 @@ public class HostBrowser {
final Collection<String> reloadURLs = new ArrayList<String>();
final Set<String> reloadURLCollection = new HashSet<String>();
long timeoutList = System.currentTimeMillis() + TIMEOUT;
long remainingTime = TIMEOUT;
long timeoutReferences = System.currentTimeMillis() + 6000;
ReferenceReportCache rrCache = sb.index.getReferenceReportCache();
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences));
if (u.startsWith(path)) {
if (delete) {
deleteIDs.add(ids);
} else {
if (error == null) storedDocs.add(u); else {
if (reload404 && error == FailType.fail) {
ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName());
if (collections != null) reloadURLCollection.addAll(collections);
reloadURLs.add(u);
}
if (authorized) errorDocs.put(u, error);
}
}
} else if (complete) {
if (error == null) storedDocs.add(u); else {
if (authorized) errorDocs.put(u, error);
}
}
if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link
if (error == null) {
hostsize++;
// collect inboundlinks to browse the host
Iterator<String> links = URIMetadataNode.getLinks(doc, true);
while (links.hasNext()) {
u = links.next();
if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u);
}
try {
SolrDocument doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
while (doc != AbstractSolrConnector.POISON_DOCUMENT && doc != null) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences));
if (u.startsWith(path)) {
if (delete) {
deleteIDs.add(ids);
} else {
if (error == null) storedDocs.add(u); else {
if (reload404 && error == FailType.fail) {
ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName());
if (collections != null) reloadURLCollection.addAll(collections);
reloadURLs.add(u);
}
if (authorized) errorDocs.put(u, error);
}
}
} else if (complete) {
if (error == null) storedDocs.add(u); else {
if (authorized) errorDocs.put(u, error);
}
}
if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link
if (error == null) {
hostsize++;
// collect inboundlinks to browse the host
Iterator<String> links = URIMetadataNode.getLinks(doc, true);
while (links.hasNext()) {
u = links.next();
if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u);
}
// collect referrer links
links = URIMetadataNode.getLinks(doc, false);
while (links.hasNext()) {
u = links.next();
try {
MultiProtocolURL mu = new MultiProtocolURL(u);
if (mu.getHost() != null) {
ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost());
if (lks == null) {
lks = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
outboundHosts.put(mu.getHost(), lks);
}
lks.set(u, u.length());
}
} catch (final MalformedURLException e) {}
}
}
if (System.currentTimeMillis() > timeoutList) break;
// collect referrer links
links = URIMetadataNode.getLinks(doc, false);
while (links.hasNext()) {
u = links.next();
try {
MultiProtocolURL mu = new MultiProtocolURL(u);
if (mu.getHost() != null) {
ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost());
if (lks == null) {
lks = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
outboundHosts.put(mu.getHost(), lks);
}
lks.set(u, u.length());
}
} catch (final MalformedURLException e) {}
}
}
remainingTime = timeoutList - System.currentTimeMillis();
if (remainingTime <= 0) {
break;
}
doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
}
} finally {
/* Ensure termination of the query thread and proper release of its resources */
solrQueryTask.interrupt();
}
if (deleteIDs.size() > 0) sb.remove(deleteIDs);
if (reloadURLs.size() > 0) {
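
The HostBrowser rework above boils down to one pattern: a bounded queue, a dedicated producer thread, a consumer loop that polls with the remaining time instead of calling take(), and an interrupt of the producer in a finally block. A simplified, self-contained sketch of that pattern with a plain String sentinel instead of POISON_DOCUMENT (hypothetical class and names, not the YaCy code):

```java
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

public class BoundedQueueConsumerSketch {

    /** Sentinel marking the end of the feed, compared by reference like POISON_DOCUMENT. */
    private static final String POISON = new String("POISON");

    public static void main(String[] args) throws InterruptedException {
        final int pageSize = 100;
        final long timeout = 3000; // overall deadline in milliseconds
        final BlockingQueue<String> docs = new ArrayBlockingQueue<>(pageSize);

        // Producer: fills the bounded queue, then terminates it with the poison element
        final Thread producer = new Thread(() -> {
            try {
                for (int i = 0; i < 1000; i++) {
                    docs.put("doc-" + i);
                }
                docs.put(POISON);
            } catch (final InterruptedException e) {
                Thread.currentThread().interrupt(); // stop producing when interrupted
            }
        });
        producer.start();

        final long deadline = System.currentTimeMillis() + timeout;
        long remainingTime = timeout;
        try {
            // poll with the remaining time instead of take(), so the consumer
            // never blocks past the global deadline
            String doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
            while (doc != null && doc != POISON) {
                System.out.println("processing " + doc);
                remainingTime = deadline - System.currentTimeMillis();
                if (remainingTime <= 0) {
                    break;
                }
                doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
            }
        } finally {
            // ensure the producer terminates even when the deadline was reached first
            producer.interrupt();
        }
    }
}
```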

@ -50,6 +50,7 @@ import net.yacy.crawler.data.Snapshots;
import net.yacy.crawler.data.Snapshots.Revisions;
import net.yacy.crawler.data.Transactions;
import net.yacy.document.ImageParser;
import net.yacy.http.servlets.TemplateMissingParameterException;
import net.yacy.http.servlets.TemplateProcessingException;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.peers.graphics.EncodedImage;
@ -69,14 +70,25 @@ public class snapshot {
public static Object respond(final RequestHeader header, serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final serverObjects defaultResponse = new serverObjects();
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
final String ext = header.get(HeaderFramework.CONNECTION_PROP_EXT, "");
if(ext.isEmpty()) {
throw new TemplateProcessingException("Missing extension. Try with rss, xml, json, pdf, png or jpg." + ext,
HttpStatus.SC_BAD_REQUEST);
}
if (ext.equals("rss")) {
// create a report about the content of the snapshot directory
if (!authenticated) return null;
if (!authenticated) {
defaultResponse.authenticationRequired();
return defaultResponse;
}
int maxcount = post == null ? 10 : post.getInt("maxcount", 10);
int depthx = post == null ? -1 : post.getInt("depth", -1);
Integer depth = depthx == -1 ? null : depthx;
@ -108,7 +120,10 @@ public class snapshot {
if (post == null) post = new serverObjects();
final boolean xml = ext.equals("xml");
final boolean pdf = ext.equals("pdf");
if (pdf && !authenticated) return null;
if (pdf && !authenticated) {
defaultResponse.authenticationRequired();
return defaultResponse;
}
final boolean pngjpg = ext.equals("png") || ext.equals(DEFAULT_EXT);
String urlhash = post.get("urlhash", "");
String url = post.get("url", "");
@ -127,7 +142,6 @@ public class snapshot {
ConcurrentLog.logException(e);
}
}
if (url.length() == 0 && durl != null) url = durl.toNormalform(true);
if (ext.equals("json")) {
// command interface: view and change a transaction state, get metadata about transactions in the past
@ -141,7 +155,10 @@ public class snapshot {
for (Map.Entry<String, Integer> state: Transactions.sizes().entrySet()) sizes.put(state.getKey(), state.getValue());
result.put("size", sizes);
} else if (command.equals("list")) {
if (!authenticated) return null;
if (!authenticated) {
defaultResponse.authenticationRequired();
return defaultResponse;
}
// return a status of the transaction archive
String host = post.get("host");
String depth = post.get("depth");
@ -179,7 +196,10 @@ public class snapshot {
}
}
} else if (command.equals("commit")) {
if (!authenticated) return null;
if (!authenticated) {
defaultResponse.authenticationRequired();
return defaultResponse;
}
Revisions r = Transactions.commit(urlhash);
if (r != null) {
result.put("result", "success");
@ -191,7 +211,10 @@ public class snapshot {
}
result.put("urlhash", urlhash);
} else if (command.equals("rollback")) {
if (!authenticated) return null;
if (!authenticated) {
defaultResponse.authenticationRequired();
return defaultResponse;
}
Revisions r = Transactions.rollback(urlhash);
if (r != null) {
result.put("result", "success");
@ -235,30 +258,36 @@ public class snapshot {
}
// for the following methods we always need the durl to fetch data
if (durl == null) return null;
if (durl == null) {
throw new TemplateMissingParameterException("Missing valid url or urlhash parameter");
}
if (xml) {
Collection<File> xmlSnapshots = Transactions.findPaths(durl, "xml", Transactions.State.ANY);
File xmlFile = null;
if (xmlSnapshots.size() == 0) {
return null;
if (xmlSnapshots.isEmpty()) {
throw new TemplateProcessingException("Could not find the xml snapshot file.", HttpStatus.SC_NOT_FOUND);
}
xmlFile = xmlSnapshots.iterator().next();
try {
byte[] xmlBinary = FileUtils.read(xmlFile);
return new ByteArrayInputStream(xmlBinary);
} catch (IOException e) {
} catch (final IOException e) {
ConcurrentLog.logException(e);
return null;
throw new TemplateProcessingException("Could not read the xml snapshot file.");
}
}
if (pdf || pngjpg) {
Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY);
File pdfFile = null;
if (pdfSnapshots.size() == 0) {
if (pdfSnapshots.isEmpty()) {
// if the client is authenticated, we create the pdf on the fly!
if (!authenticated) return null;
if (!authenticated) {
throw new TemplateProcessingException(
"Could not find the pdf snapshot file. You must be authenticated to generate one on the fly.",
HttpStatus.SC_NOT_FOUND);
}
SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash());
boolean success = false;
if (sd == null) {
@ -269,19 +298,25 @@ public class snapshot {
}
if (success) {
pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);
if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next();
if (!pdfSnapshots.isEmpty()) {
pdfFile = pdfSnapshots.iterator().next();
}
}
} else {
pdfFile = pdfSnapshots.iterator().next();
}
if (pdfFile == null) return null;
if (pdfFile == null) {
throw new TemplateProcessingException(
"Could not find the pdf snapshot file and could not generate one on the fly.",
HttpStatus.SC_NOT_FOUND);
}
if (pdf) {
try {
byte[] pdfBinary = FileUtils.read(pdfFile);
return new ByteArrayInputStream(pdfBinary);
} catch (IOException e) {
} catch (final IOException e) {
ConcurrentLog.logException(e);
return null;
throw new TemplateProcessingException("Could not read the pdf snapshot file.");
}
}
@ -338,6 +373,8 @@ public class snapshot {
}
}
return null;
throw new TemplateProcessingException(
"Unsupported extension : " + ext + ". Try with rss, xml, json, pdf, png or jpg.",
HttpStatus.SC_BAD_REQUEST);
}
}

@ -32,6 +32,22 @@ circle {
}
text {
font: 9px sans-serif;
pointer-events: none;
cursor: default;
text-shadow: 0 1px 0 #fff, 1px 0 0 #fff, 0 -1px 0 #fff, -1px 0 0 #fff;
}
text tspan.truncated {
display: none;
}
text:hover tspan.truncated {
display: inherit;
}
text tspan.ellipsis {
display: inherit;
}
text:hover tspan.ellipsis {
display: none;
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -20,15 +20,26 @@
function linkstructure(hostname, element, width, height, maxtime, maxnodes) {
var nodes = {};
var links = [];
var linkstructure = {};
$.getJSON("api/linkstructure.json?about=" + hostname + "&maxtime=" + maxtime + "&maxnodes=" + maxnodes, function(linkstructure) {
links = linkstructure.graph;
links.forEach(function(link) {
link.source = nodes[link.source] || (nodes[link.source] = {name: link.source, type:"Inbound"});
link.target = nodes[link.target] || (nodes[link.target] = {name: link.target, type:link.type});
});
var force = d3.layout.force().nodes(d3.values(nodes)).links(links).size([width, height]).linkDistance(60).charge(-800).on("tick", tick).start();
force.gravity(0.7);
/* attract nodes to the center - was set with force.gravity(0.7) in d3v3 */
var forceX = d3.forceX(width / 2).strength(0.7);
var forceY = d3.forceY(height / 2).strength(0.7);
var link = d3.forceLink(links).distance(60).strength(1);
var simulation = d3.forceSimulation()
.nodes(d3.values(nodes))
.force('link', link)
.force("center", d3.forceCenter(width / 2, height / 2)) // center elements - was set with size([width, height]) in d3v3
.force('charge', d3.forceManyBody().strength(-800))
.force('x', forceX)
.force('y', forceY)
.on("tick", ticked);
var svg = d3.select(element).append("svg").attr("id", "hypertree").attr("width", width).attr("height", height);
svg.append("defs").selectAll("marker")
.data(["Dead", "Outbound", "Inbound"])
@ -49,15 +60,25 @@ function linkstructure(hostname, element, width, height, maxtime, maxnodes) {
svg.append("text").attr("x", 10).attr("y", height - 10).text("blue: links to other domains").attr("style", "font-size:9px").attr("fill", "lightblue");
svg.append("text").attr("x", 10).attr("y", height).text("red: dead links").attr("style", "font-size:9px").attr("fill", "red");
var path = svg.append("g")
.selectAll("path").data(force.links()).enter().append("path")
.selectAll("path").data(link.links()).enter().append("path")
.attr("class",function(d) {return "hypertree-link " + d.type; })
.attr("marker-end", function(d) { return "url(#" + d.type + ")";});
var circle = svg.append("g").selectAll("circle").data(force.nodes()).enter().append("circle").attr("r", 4).call(force.drag);
var circle = svg.append("g").selectAll("circle").data(simulation.nodes()).enter().append("circle").attr("r", 4).call(d3.drag());
var maxTextLength = 40;
var text = svg.append("g")
.selectAll("text").data(force.nodes()).enter().append("text").attr("x", 8).attr("y", ".31em")
.selectAll("text").data(simulation.nodes()).enter().append("text").attr("x", 8).attr("y", ".31em")
.attr("style", function(d) {return d.type == "Outbound" ? "fill:#888888;" : "fill:#000000;";})
.text(function(d) {return d.name;});
function tick() {
.text(function(d) {/* Limit the length of the visible node text to improve readability */ return d.name.substring(0, Math.min(d.name.length, maxTextLength));});
text.append("tspan")
.attr("class", "truncated")
.text(function(d) {/* The end of long texts is wrapped in a tspan, made visible on mouse over */ return d.name.length > maxTextLength ? d.name.substring(maxTextLength) : ""});
text.append("tspan")
.attr("class", "ellipsis")
.text(function(d) {/* Add an ellipsis to mark long texts that are truncated */ return d.name.length > maxTextLength ? "..." : ""});
function ticked() {
path.attr("d", linkArc);
circle.attr("transform", transform);
text.attr("transform", transform);

@ -99,9 +99,9 @@
<td><a href="env/bootstrap/js/typeahead.jquery.js">typeahead.jquery.js</a> (0.10.5)</td>
</tr>
<tr>
<td><a href="js/d3.v3.min.js">d3.v3.min.js</a></td>
<td><a href="js/d3.v5.min.js">d3.v5.min.js</a></td>
<td><a href="http://opensource.org/licenses/BSD-3-Clause">Modified-BSD</a></td>
<td><a href="https://raw.githubusercontent.com/d3/d3.github.com/b3382f60bf721923c7c649709adcfb4c8b66d994/d3.v3.js">d3.v3.js</a> (3.4.4)</td>
<td><a href="https://unpkg.com/d3@5.7.0/dist/d3.js">d3.js</a> (5.7.0)</td>
</tr>
<tr>
<td><a href="js/highslide/highslide.js">highslide.js</a></td>

@ -21,6 +21,7 @@
package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
@ -31,24 +32,17 @@ import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.LookAheadIterator;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.FacetField.Count;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
@ -58,6 +52,14 @@ import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.DisMaxParams;
import org.apache.solr.common.params.FacetParams;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.LookAheadIterator;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.schema.CollectionSchema;
public abstract class AbstractSolrConnector implements SolrConnector {
protected static Set<String> SOLR_ID_FIELDS = new HashSet<String>();
@ -170,19 +172,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
}
/**
* Get results from solr queries as a stream of documents.
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
* The method returns immediately and feeds the search results into the queue
* @param querystrings the list of solr query strings
* @param sort the solr sort string, may be null to be not used
* @param offset first result offset
* @param maxcount the maximum number of results
* @param maxtime the maximum time in milliseconds
* @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
* @param concurrency is the number of AbstractSolrConnector.POISON_DOCUMENT entries to add at the end of the feed
* @param prefetchIDs if true, then first all IDs are fetched and then all documents are queries by the ID. If false then documents are retrieved directly
* @param fields list of fields
* @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
* {@inheritDoc}
*/
@Override
public BlockingQueue<SolrDocument> concurrentDocumentsByQueries(
@ -195,12 +185,11 @@ public abstract class AbstractSolrConnector implements SolrConnector {
final int concurrency,
final boolean prefetchIDs,
final String ... fields) {
assert buffersize > 0;
if (!prefetchIDs) return concurrentDocumentsByQueriesNoPrefetch(querystrings, sort, offset, maxcount, maxtime, buffersize, concurrency, fields);
final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(Math.max(buffersize, concurrency));
if (querystrings.size() == 0) {
for (int i = 0; i < Math.max(1, concurrency); i++) try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {}
return queue;
if (!prefetchIDs) {
final Thread t = new Thread(newDocumentsByQueriesTask(queue, querystrings, sort, offset, maxcount, maxtime, buffersize, concurrency, fields));
t.start();
return queue;
}
final BlockingQueue<String> idQueue = concurrentIDsByQueries(querystrings, sort, offset, maxcount, maxtime, Math.min(maxcount, 10000000), concurrency);
final long endtime = maxtime < 0 || maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity!
@ -235,7 +224,9 @@ public abstract class AbstractSolrConnector implements SolrConnector {
return queue;
}
private BlockingQueue<SolrDocument> concurrentDocumentsByQueriesNoPrefetch(
@Override
public Runnable newDocumentsByQueriesTask(
final BlockingQueue<SolrDocument> queue,
final List<String> querystrings,
final String sort,
final int offset,
@ -244,59 +235,85 @@ public abstract class AbstractSolrConnector implements SolrConnector {
final int buffersize,
final int concurrency,
final String ... fields) {
assert buffersize > 0;
final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(buffersize);
if (querystrings.size() == 0) {
for (int i = 0; i < Math.max(1, concurrency); i++) try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {}
return queue;
Objects.requireNonNull(queue, "The queue parameter must not be null.");
if (querystrings == null || querystrings.isEmpty()) {
return () -> {
for (int i = 0; i < Math.max(1, concurrency); i++) {
try {
queue.put(AbstractSolrConnector.POISON_DOCUMENT);
} catch (final InterruptedException e1) {
Thread.currentThread().interrupt(); // preserve interrupted thread state
}
}
};
}
final long endtime = maxtime < 0 || maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity!
final int ps = buffersize < 0 ? pagesize_docs : Math.min(pagesize_docs, buffersize);
final int maxretries = 6;
final Thread t = new Thread() {
@Override
public void run() {
try {
for (String querystring: querystrings) {
this.setName("AbstractSolrConnector:concurrentDocumentsByQueryNoPrefetch(" + querystring + ")");
int o = offset;
int count = 0;
int retry = 0;
loop: while (System.currentTimeMillis() < endtime && count < maxcount) {
try {
SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, ps), fields);
for (SolrDocument d: sdl) {
try {queue.put(d);} catch (final InterruptedException e) {break;}
count++;
}
if (sdl.size() < ps) {
//System.out.println("sdl.size() = " + sdl.size() + ", pagesize = " + pagesize);
break loop; // finished
}
o += sdl.size();
retry = 0;
} catch (final SolrException | IOException e) {
ConcurrentLog.logException(e);
if (retry++ < maxretries) {
// remote Solr may be temporary down, so we wait a bit
try {Thread.sleep(100);} catch (InterruptedException e1) {}
continue loop;
}
// fail
ConcurrentLog.severe("AbstractSolrConnector", "aborted concurrentDocumentsByQueryNoPrefetch after " + maxretries + " retries: " + e.getMessage());
break;
}
return () -> {
long remainingTime = endtime - System.currentTimeMillis();
try {
for (final String querystring: querystrings) {
Thread.currentThread().setName("AbstractSolrConnector:concurrentDocumentsByQueryNoPrefetch(" + querystring + ")");
int o = offset;
int count = 0;
int retry = 0;
loop: while (remainingTime > 0 && count < maxcount) {
try {
final SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, ps), fields);
for (final SolrDocument d: sdl) {
if (endtime != Long.MAX_VALUE) {
/*
* A timeout is defined: we must not use queue.put() here, otherwise this
* thread could wait indefinitely when the queue is full and the
* consumer thread has stopped taking elements from the queue.
*/
if (!queue.offer(d, remainingTime, TimeUnit.MILLISECONDS)) {
break;
}
} else {
queue.put(d);
}
count++;
}
if (sdl.size() < ps) {
break loop; // finished
}
o += sdl.size();
retry = 0;
} catch(final InterruptedIOException e) {
throw new InterruptedException(); // rethrow to finish the process
} catch (final SolrException | IOException e) {
ConcurrentLog.logException(e);
if (retry++ < maxretries) {
// remote Solr may be temporary down, so we wait a bit
Thread.sleep(100);
continue loop;
}
// fail
ConcurrentLog.severe("AbstractSolrConnector", "aborted concurrentDocumentsByQueryNoPrefetch after " + maxretries + " retries: " + e.getMessage());
break;
}
}
} catch (Throwable e) {} finally {
for (int i = 0; i < Math.max(1, concurrency); i++) {
try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {}
remainingTime = endtime - System.currentTimeMillis();
}
}
} catch(final InterruptedException e) {
Thread.currentThread().interrupt(); // preserve interrupted thread state
} catch (final RuntimeException e) {
ConcurrentLog.logException(e);
} finally {
/* Add poison elements only when the thread has not been interrupted */
for (int i = 0; i < Math.max(1, concurrency); i++) {
try {
queue.put(AbstractSolrConnector.POISON_DOCUMENT);
} catch (final InterruptedException e1) {
Thread.currentThread().interrupt(); // preserve interrupted thread state
break; // thread is interrupted: in that case we do not try to add any more poison elements to the queue
}
}
}
};
t.start();
return queue;
}
/**
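
The inline comment in the task above explains why queue.offer(d, remainingTime, TimeUnit.MILLISECONDS) replaces queue.put(d) once a deadline is set: put() blocks forever on a full queue when the consumer has stopped taking elements. A tiny standalone illustration of the difference (hypothetical class, not YaCy code):

```java
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

public class OfferVersusPut {

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = new ArrayBlockingQueue<>(1);
        queue.put("first"); // the queue is now full and nobody is consuming

        // offer() gives up after the timeout and reports the failure...
        boolean accepted = queue.offer("second", 500, TimeUnit.MILLISECONDS);
        System.out.println("accepted within 500 ms: " + accepted); // prints false

        // ...whereas queue.put("second") would block this thread indefinitely here.
    }
}
```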

@ -224,9 +224,11 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
public LinkedHashMap<String, ReversibleScoreMap<String>> getFacets(String query, int maxresults, final String ... fields) throws IOException;
/**
* Get results from a solr query as a stream of documents.
* <p>Get results from solr queries as a stream of documents.
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
* The method returns immediately and feeds the search results into the queue
* The method returns immediately and feeds the search results into the queue.</p>
* <p><strong>Important</strong>: be careful if the consumer thread(s) terminate before taking the poison document(s) from the queue,
* as the producer thread(s) may block indefinitely on their last step (adding the poison element) because the queue would be full.</p>
* @param querystring the solr query string
* @param sort the solr sort string, may be null to be not used
* @param offset first result offset
@ -249,6 +251,27 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
final boolean prefetchIDs,
final String ... fields);
/**
* Creates a new runnable task that runs a given list of Solr queries and fills a
* results queue in batches of a limited number of results.
*
* @param queue the results queue. Must not be null.
* @param querystrings a list of Solr queries
* @param sort an optional Solr sort criterion (may be null)
* @param offset the results offset position for each query
* @param maxcount the maximum number of documents per query to retrieve
* @param maxtime the total maximum time to spend. Unlimited when the value
* is negative or equal to Long.MAX_VALUE
* @param buffersize this is the maximum size of a page of results to retrieve
* in one step when running a query
* @param concurrency the number of consuming threads
* @param fields the indexed fields to retrieve
* @return a ready to run task
*/
public Runnable newDocumentsByQueriesTask(final BlockingQueue<SolrDocument> queue, final List<String> querystrings,
final String sort, final int offset, final int maxcount, final long maxtime, final int buffersize,
final int concurrency, final String... fields);
/**
* Get results from solr queries as a stream of documents.
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
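
Since newDocumentsByQueriesTask only builds a Runnable, the caller owns the thread that executes it, which is what allows the HostBrowser change earlier in this commit to interrupt the query in a finally block. A compile-time sketch of the call shape, assuming a SolrConnector instance is available (class name, query and field list are illustrative only):

```java
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

import org.apache.solr.common.SolrDocument;

import net.yacy.cora.federate.solr.connector.SolrConnector;

public class QueriesTaskUsageSketch {

    /** Starts feeding documents matching the given query into a bounded queue. */
    public static BlockingQueue<SolrDocument> startFeed(final SolrConnector connector) {
        final int pageSize = 100;
        final BlockingQueue<SolrDocument> docs = new ArrayBlockingQueue<>(pageSize);
        final List<String> queries = Arrays.asList("*:*");

        // The connector only builds the task; the caller owns the thread and may
        // interrupt it later, as HostBrowser does in its finally block.
        final Thread solrQueryTask = new Thread(connector.newDocumentsByQueriesTask(
                docs, queries, null, 0, 1000, 30000L, pageSize, 1, "id", "sku"));
        solrQueryTask.start();

        // Consumers should poll(timeout) until AbstractSolrConnector.POISON_DOCUMENT is
        // returned; a real caller would also keep solrQueryTask to interrupt it when done.
        return docs;
    }
}
```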

@ -78,6 +78,13 @@ public class ClientIdentification {
public final static String customAgentName = "Custom Agent";
public final static String browserAgentName = "Random Browser";
public static Agent browserAgent;
/**
* provide system information (this is part of YaCy protocol)
*/
public static final String yacySystem = System.getProperty("os.arch", "no-os-arch") + " " +
System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") +
"; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation(); // keep this before the following static initialization block as this constant is used by generateYaCyBot()
static {
generateYaCyBot("new");
@ -87,13 +94,6 @@ public class ClientIdentification {
agents.put(yacyProxyAgentName, yacyProxyAgent);
}
/**
* provide system information (this is part of YaCy protocol)
*/
public static final String yacySystem = System.getProperty("os.arch", "no-os-arch") + " " +
System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") +
"; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation();
/**
* produce a YaCy user agent string
* @param addinfo
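
Moving yacySystem above the static initialization block matters because of Java's static initialization order: static field initializers and static blocks run in textual order, and the block calls generateYaCyBot(), which reads yacySystem. A self-contained illustration of the pitfall with hypothetical names (not YaCy code):

```java
public class StaticInitOrder {

    static {
        // Runs before GREETING below has been initialized: build() still sees null here.
        System.out.println("static block: " + build());
    }

    // Not a compile-time constant, so this initializer runs in textual order,
    // i.e. only after the static block above has finished.
    static final String GREETING = "java " + System.getProperty("java.version", "no-java-version");

    static String build() {
        return "greeting = " + GREETING;
    }

    public static void main(String[] args) {
        System.out.println("main: " + build()); // fully initialized by now
    }
}
```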

@ -43,6 +43,9 @@ import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.ImageView;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;
@ -58,10 +61,16 @@ import org.apache.pdfbox.rendering.PDFRenderer;
public class Html2Image {
// Mac
// to install wkhtmltopdf, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from http://wkhtmltopdf.org/downloads.html
/**
* Path to wkhtmltopdf executable on Mac OS when installed using
* wkhtmltox-n.n.n.macos-cocoa.pkg from https://wkhtmltopdf.org/downloads.html.
* This path may also exist on Debian or another GNU/Linux distribution.
*/
private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf");
// to install imagemagick, download from http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip
// the convert command from imagemagick needs ghostscript, if not present on older macs, download a version of gs from http://pages.uoregon.edu/koch/
private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf"); // sometimes this is also the path on debian
private final static File convertMac1 = new File("/opt/local/bin/convert");
private final static File convertMac2 = new File("/opt/ImageMagick/bin/convert");
@ -69,11 +78,27 @@ public class Html2Image {
// to install: apt-get install wkhtmltopdf imagemagick xvfb ghostscript
private final static File wkhtmltopdfDebian = new File("/usr/bin/wkhtmltopdf"); // there is no wkhtmltoimage, use convert to create images
private final static File convertDebian = new File("/usr/bin/convert");
/**
* Path to wkhtmltopdf executable on Windows, when installed with default
* settings using wkhtmltox-n.n.n.msvc2015-win64.exe from
* https://wkhtmltopdf.org/downloads.html
*/
private static final File WKHTMLTOPDF_WINDOWS = new File("C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe");
/**
* Path to wkhtmltopdf executable on Windows, when installed with default
* settings using wkhtmltox-n.n.n.msvc2015-win32.exe from
* https://wkhtmltopdf.org/downloads.html
*/
private static final File WKHTMLTOPDF_WINDOWS_X86 = new File(
"C:\\Program Files (x86)\\wkhtmltopdf\\bin\\wkhtmltopdf.exe");
private static boolean usexvfb = false;
public static boolean wkhtmltopdfAvailable() {
return wkhtmltopdfMac.exists() || wkhtmltopdfDebian.exists();
return OS.isWindows ? (WKHTMLTOPDF_WINDOWS.exists() || WKHTMLTOPDF_WINDOWS_X86.exists())
: (wkhtmltopdfMac.exists() || wkhtmltopdfDebian.exists());
}
public static boolean convertAvailable() {
@ -107,7 +132,9 @@ public class Html2Image {
}
private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination, final String userAgent, final String acceptLanguage, final boolean ignoreErrors) {
final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
final File wkhtmltopdf = OS.isWindows
? (WKHTMLTOPDF_WINDOWS.exists() ? WKHTMLTOPDF_WINDOWS : WKHTMLTOPDF_WINDOWS_X86)
: (wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian);
String commandline =
wkhtmltopdf.getAbsolutePath() + " -q --title '" + url + "' " +
//acceptLanguage == null ? "" : "--custom-header 'Accept-Language' '" + acceptLanguage + "' " +
@ -285,12 +312,54 @@ public class Html2Image {
ImageIO.write(img, destination.getName().endsWith("jpg") ? "jpg" : "png", destination);
}
public static void main(String[] args) {
try {
Html2Image.writeSwingImage(args[0], new Dimension(1200, 2000), new File(args[1]));
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Test PDF or image snapshot generation for a given URL.
* @param args main arguments list:
* <ol>
* <li>Source remote URL (required)</li>
* <li>Target local file path (required)</li>
* <li>Snapshot generation method identifier (optional) :
* <ul>
* <li>"wkhtmltopdf" (default): generate a PDF snapshot using external wkhtmltopdf tool.</li>
* <li>"swing" : use JRE provided Swing to generate a jpg or png image snapshot.</li>
* </ul>
* </li>
* </ol>
*/
public static void main(String[] args) {
try {
if (args.length < 2) {
System.out.println("Missing required parameter(s).");
System.out.println("Usage : java " + Html2Image.class.getName()
+ " <url> <target-file[.pdf|.jpg|.png]> [wkhtmltopdf|swing]");
return;
}
if (args.length < 3 || "wkhtmltopdf".equals(args[2])) {
if(Html2Image.wkhtmltopdfAvailable()) {
Html2Image.writeWkhtmltopdf(args[0], null, ClientIdentification.yacyInternetCrawlerAgent.userAgent,
"en-us,en;q=0.5", new File(args[1]));
} else {
System.out.println("Unable to locate wkhtmltopdf executable on this system!");
}
} else if ("swing".equals(args[2])) {
try {
Html2Image.writeSwingImage(args[0], new Dimension(1200, 2000), new File(args[1]));
} catch (final IOException e) {
e.printStackTrace();
}
} else {
System.out.println("Unknown method : please specify either wkhtmltopdf or swing");
}
} finally {
/* Shutdown running threads */
Domains.close();
try {
HTTPClient.closeConnectionManager();
} catch (final InterruptedException e) {
Thread.currentThread().interrupt(); // restore interrupted state
}
ConcurrentLog.shutdown();
}
}
}

@ -374,13 +374,20 @@ public final class CrawlStacker implements WorkflowTask<Request>{
return error;
}
// check availability of parser and maxfilesize
String warning = null;
//ContentDomain contentDomain = entry.url().getContentDomainFromExt();
if (TextParser.supportsExtension(entry.url()) != null) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
//if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;
if (!profile.isCrawlerAlwaysCheckMediaType() && TextParser.supportsExtension(entry.url()) != null) {
if(profile.isIndexNonParseableUrls()) {
/* Unsupported file extension and no cross-checking of the Media Type: add the entry immediately to the noload stack so that only URL metadata is indexed */
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
if (warning != null && CrawlStacker.log.isFine()) {
CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed to " + NoticedURL.StackType.NOLOAD + " stack : " + warning);
}
return null;
}
error = "URL '" + entry.url().toString() + "' file extension is not supported and indexing of linked non-parsable documents is disabled.";
CrawlStacker.log.info(error);
return error;
}
if (global) {
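
Condensed, the new CrawlStacker branch above makes a three-way decision for a URL whose file extension has no matching parser, driven by the two profile flags introduced or renamed in this commit. A simplified restatement (hypothetical class, simplified outcomes; the real method performs further checks afterwards):

```java
import net.yacy.crawler.data.CrawlProfile; // assumed package of CrawlProfile

public class UnsupportedExtensionPolicy {

    /**
     * Decides what happens to a crawl entry whose URL file extension is unsupported.
     * Returns null when the entry is kept, or an error message when it is rejected.
     */
    static String checkUnsupportedExtension(final CrawlProfile profile) {
        if (profile.isCrawlerAlwaysCheckMediaType()) {
            // load the URL anyway; the Content-Type header decides later which parser applies
            return null;
        }
        if (profile.isIndexNonParseableUrls()) {
            // do not load the document, but keep the URL: the real code pushes it to the
            // NOLOAD stack so that only URL metadata gets indexed
            return null;
        }
        return "file extension is not supported and indexing of linked non-parsable documents is disabled";
    }
}
```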

@ -96,6 +96,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
FOLLOW_FRAMES ("followFrames", false, CrawlAttribute.BOOLEAN, "Flag if frames shall be followed (no by default)"),
OBEY_HTML_ROBOTS_NOINDEX ("obeyHtmlRobotsNoindex", false, CrawlAttribute.BOOLEAN, "Obey html-robots-noindex"),
OBEY_HTML_ROBOTS_NOFOLLOW ("obeyHtmlRobotsNofollow", false, CrawlAttribute.BOOLEAN, "Obey html-robots-nofollow"),
CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
CRAWLER_URL_MUSTMATCH ("crawlerURLMustMatch", false, CrawlAttribute.STRING, "URL Must-Match Filter"),
CRAWLER_URL_MUSTNOTMATCH ("crawlerURLMustNotMatch", false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
CRAWLER_IP_MUSTMATCH ("crawlerIPMustMatch", false, CrawlAttribute.STRING, "IP Must-Match Filter"),
@ -239,6 +240,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.HANDLE.key, handle);
put(CrawlAttribute.NAME.key, name);
put(CrawlAttribute.AGENT_NAME.key, userAgentName);
put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
@ -673,11 +675,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public boolean directDocByURL() {
/**
* @return true when URLs of unsupported resources (no parser available or denied format) should
* be indexed as links (with metadata about the URL only, not about the content).
*/
public boolean isIndexNonParseableUrls() {
final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* @return true when the crawler must always cross-check the URL file
* extension, if any, against the actual Media Type, even when the file extension is
* unknown or unsupported. False when the crawler should not load URLs
* with an unknown or unsupported file extension.
*/
public boolean isCrawlerAlwaysCheckMediaType() {
final String r = get(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key);
if (r == null) {
return false;
}
return (r.equals(Boolean.TRUE.toString()));
}
public CacheStrategy cacheStrategy() {
final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
@ -889,7 +909,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(CrawlAttribute.AGENT_NAME.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.isIndexNonParseableUrls() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
@ -903,6 +923,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CrawlAttribute.CACHE_STRAGEGY.key));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));

@ -55,10 +55,18 @@ public class NoticedURL {
LOCAL, GLOBAL, REMOTE, NOLOAD;
}
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth
private Balancer remoteStack; // links from remote crawl orders (init on demand)
private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
/** links found by crawling to depth-1 */
private Balancer coreStack;
/** links found by crawling at target depth */
private Balancer limitStack;
/** links from remote crawl orders (init on demand) */
private Balancer remoteStack;
/** links that are not passed to a loader; the index will be generated from the Request entry */
private Balancer noloadStack;
private final File cachePath;
protected NoticedURL(

@ -742,8 +742,12 @@ public class Response {
// -ranges in request
// we checked that in shallStoreCache
// check if document can be indexed
if (this.responseHeader != null) {
/*
* Check whether a parser supports the media type. Depending on the crawl
* profile, the indexingDocumentProcessor may index only URL metadata
* using the generic parser for unsupported media types.
*/
if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
final String mimeType = this.responseHeader.getContentType();
final String parserError = TextParser.supportsMime(mimeType);
if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError;

@ -241,6 +241,29 @@ public final class TextParser {
return docs;
}
/**
* Apply only the generic parser to the given content from location.
*/
public static Document[] genericParseSource(
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignoreClassNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final byte[] content
) throws Parser.Failure {
if (AbstractParser.log.isFine()) {
AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
}
mimeType = normalizeMimeType(mimeType);
Set<Parser> idioms = new HashSet<>();
idioms.add(TextParser.genericIdiom);
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
}
private static Document[] parseSource(
final DigestURL location,
String mimeType,
@ -644,7 +667,7 @@ public final class TextParser {
* @param url the given url
* @param mimeType the given mime type
* @return a list of Idiom parsers that may be appropriate for the given criteria
* @throws Parser.Failure
* @throws Parser.Failure when the file extension or the MIME type is denied
*/
private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
@ -661,7 +684,12 @@ public final class TextParser {
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext != null && ext.length() > 0) {
if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
/* We do not throw an exception here when the media type is provided and inconsistent with the extension (if the media type is not supported, an exception has already been thrown).
* Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
* Notable example: Wikimedia Commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
}
idiom = ext2parser.get(ext);
if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
idioms.addAll(idiom);

@ -34,6 +34,7 @@ import java.util.StringTokenizer;
import javax.net.ssl.KeyManagerFactory;
import javax.net.ssl.SSLContext;
import org.eclipse.jetty.http.HttpMethod;
import org.eclipse.jetty.http.HttpVersion;
import org.eclipse.jetty.server.Connector;
import org.eclipse.jetty.server.Handler;
@ -144,18 +145,20 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer {
//sholder.setInitParameter("welcomeFile", "index.html"); // default is index.html, welcome.html
htrootContext.addServlet(sholder, "/*");
/* Handle gzip compression of responses to user agents accepting it */
final GzipHandler gzipHandler;
if (sb.getConfigBool(SwitchboardConstants.SERVER_RESPONSE_COMPRESS_GZIP,
final GzipHandler gzipHandler = new GzipHandler();
/*
* Decompression of incoming request bodies is required for the index distribution
* APIs /yacy/transferRWI.html and /yacy/transferURL.html. This was previously
* handled by a GZIPRequestWrapper in the YaCyDefaultServlet.
*/
gzipHandler.setInflateBufferSize(4096);
if (!sb.getConfigBool(SwitchboardConstants.SERVER_RESPONSE_COMPRESS_GZIP,
SwitchboardConstants.SERVER_RESPONSE_COMPRESS_GZIP_DEFAULT)) {
gzipHandler = new GzipHandler();
/*
* Ensure decompression of requests body is disabled : it is already handled by
* the GZIPRequestWrapper in the YaCyDefaultServlet
*/
gzipHandler.setInflateBufferSize(0);
htrootContext.setGzipHandler(gzipHandler);
/* Gzip compression of responses can be disabled by user configuration */
gzipHandler.setExcludedMethods(HttpMethod.GET.asString(), HttpMethod.POST.asString());
}
htrootContext.setGzipHandler(gzipHandler);
// -----------------------------------------------------------------------------
// here we set and map the mandatory servlets, needed for typical YaCy operation
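
For context on the comments above: in Jetty 9.4 a single GzipHandler can both inflate gzip-compressed request bodies (setInflateBufferSize greater than 0) and compress responses, and response compression can be switched off per HTTP method. A minimal standalone sketch using the same calls as this commit (assumed Jetty 9.4 API, hypothetical class and servlet mapping):

```java
import org.eclipse.jetty.http.HttpMethod;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.handler.gzip.GzipHandler;
import org.eclipse.jetty.servlet.DefaultServlet;
import org.eclipse.jetty.servlet.ServletContextHandler;

public class GzipHandlerSketch {

    public static void main(String[] args) throws Exception {
        final Server server = new Server(8090);

        final ServletContextHandler context = new ServletContextHandler(ServletContextHandler.SESSIONS);
        context.setContextPath("/");
        context.addServlet(DefaultServlet.class, "/*");

        final GzipHandler gzipHandler = new GzipHandler();
        // inflate gzip-compressed request bodies (needed for transferRWI/transferURL style APIs)
        gzipHandler.setInflateBufferSize(4096);
        // exclude GET and POST from response compression, keeping request inflation active
        gzipHandler.setExcludedMethods(HttpMethod.GET.asString(), HttpMethod.POST.asString());
        context.setGzipHandler(gzipHandler);

        server.setHandler(context);
        server.start();
        server.join();
    }
}
```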

@ -42,25 +42,33 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
import javax.servlet.ReadListener;
import javax.servlet.RequestDispatcher;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.ServletInputStream;
import javax.servlet.UnavailableException;
import javax.servlet.http.Cookie;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletRequestWrapper;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.FileItemFactory;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.fileupload.servlet.ServletFileUpload;
import org.eclipse.jetty.http.HttpHeader;
import org.eclipse.jetty.http.HttpMethod;
import org.eclipse.jetty.http.MimeTypes;
import org.eclipse.jetty.io.WriterOutputStream;
import org.eclipse.jetty.server.InclusiveByteRange;
import org.eclipse.jetty.util.MultiPartOutputStream;
import org.eclipse.jetty.util.URIUtil;
import org.eclipse.jetty.util.resource.Resource;
import com.google.common.net.HttpHeaders;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.order.Base64Order;
@ -75,38 +83,19 @@ import net.yacy.data.InvalidURLLicenceException;
import net.yacy.data.TransactionManager;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.NamePrefixThreadFactory;
import net.yacy.peers.Seed;
import net.yacy.peers.graphics.EncodedImage;
import net.yacy.peers.operation.yacyBuildProperties;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.http.HTTPDFileHandler;
import net.yacy.server.http.TemplateEngine;
import net.yacy.server.serverClassLoader;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.server.servletProperties;
import net.yacy.server.http.HTTPDFileHandler;
import net.yacy.server.http.TemplateEngine;
import net.yacy.visualization.RasterPlotter;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.FileItemFactory;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.fileupload.servlet.ServletFileUpload;
import org.eclipse.jetty.http.HttpHeader;
import org.eclipse.jetty.http.HttpMethod;
import org.eclipse.jetty.http.MimeTypes;
import org.eclipse.jetty.io.WriterOutputStream;
import org.eclipse.jetty.server.InclusiveByteRange;
import org.eclipse.jetty.util.MultiPartOutputStream;
import org.eclipse.jetty.util.URIUtil;
import org.eclipse.jetty.util.resource.Resource;
import com.google.common.net.HttpHeaders;
import com.google.common.util.concurrent.SimpleTimeLimiter;
import com.google.common.util.concurrent.TimeLimiter;
import com.google.common.util.concurrent.UncheckedTimeoutException;
/**
* YaCyDefaultServlet based on Jetty DefaultServlet.java
* handles static files and the YaCy servlets.
@ -152,8 +141,6 @@ public class YaCyDefaultServlet extends HttpServlet {
protected static final File TMPDIR = new File(System.getProperty("java.io.tmpdir"));
protected static final int SIZE_FILE_THRESHOLD = 1024 * 1024 * 1024; // 1GB is a lot but appropriate for multi-document pushes using the push_p.json servlet
protected static final FileItemFactory DISK_FILE_ITEM_FACTORY = new DiskFileItemFactory(SIZE_FILE_THRESHOLD, TMPDIR);
private final static TimeLimiter timeLimiter = new SimpleTimeLimiter(Executors.newCachedThreadPool(
new NamePrefixThreadFactory(YaCyDefaultServlet.class.getSimpleName() + ".timeLimiter")));
/* ------------------------------------------------------------ */
@Override
public void init() throws UnavailableException {
@ -866,12 +853,7 @@ public class YaCyDefaultServlet extends HttpServlet {
RequestHeader legacyRequestHeader = generateLegacyRequestHeader(request, target, targetExt);
// add multipart-form fields to parameter
if (ServletFileUpload.isMultipartContent(request)) {
final String bodyEncoding = request.getHeader(HeaderFramework.CONTENT_ENCODING);
if (HeaderFramework.CONTENT_ENCODING_GZIP.equalsIgnoreCase(bodyEncoding)) {
parseMultipart(new GZIPRequestWrapper(request),args);
} else {
parseMultipart(request, args);
}
parseMultipart(request, args);
}
// eof modification to read attribute
Object tmp;
@ -1336,122 +1318,4 @@ public class YaCyDefaultServlet extends HttpServlet {
ConcurrentLog.info("FILEHANDLER", ex.getMessage());
}
}
/**
* Wraps the request to decompress its gzip'ed input stream
*/
private class GZIPRequestWrapper extends HttpServletRequestWrapper {
private final ServletInputStream is;
public GZIPRequestWrapper(HttpServletRequest request) throws IOException {
super(request);
this.is = new GZIPRequestStream(request);
}
@Override
public ServletInputStream getInputStream() throws IOException {
return is;
}
}
private class GZIPRequestStream extends ServletInputStream {
private final GZIPInputStream in;
private final ServletInputStream sin;
public GZIPRequestStream(HttpServletRequest request) throws IOException {
sin = request.getInputStream();
in = new GZIPInputStream(sin);
}
@Override
public int read() throws IOException {
return in.read();
}
@Override
public int read(byte[] b) throws IOException {
return read(b, 0, b.length);
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
try {
return timeLimiter.callWithTimeout(new CallableReader(in, b, off, len), len + 600, TimeUnit.MILLISECONDS, false);
} catch (final UncheckedTimeoutException e) {
return -1;
} catch (Exception e) {
throw new IOException(e);
}
}
@Override
public void close() throws IOException {
in.close();
}
@Override
public int available() throws IOException {
return in.available();
}
@Override
public synchronized void mark(int readlimit) {
in.mark(readlimit);
}
@Override
public boolean markSupported() {
return in.markSupported();
}
@Override
public synchronized void reset() throws IOException {
in.reset();
}
@Override
public long skip(long n) throws IOException {
return in.skip(n);
}
@Override
public boolean isFinished() {
try {
return available() < 1;
} catch (final IOException ex) {
return true;
}
}
@Override
public boolean isReady() {
return sin.isReady() && !isFinished();
}
@Override
public void setReadListener(ReadListener rl) {
sin.setReadListener(rl);
}
}
private class CallableReader implements Callable<Integer> {
private int off, len;
private byte[] b;
private GZIPInputStream in;
public CallableReader(final GZIPInputStream in, byte[] b, int off, int len) {
this.in = in;
this.b = b;
this.off = off;
this.len = len;
}
@Override
public Integer call() throws Exception {
return in.read(b, off, len);
}
}
}
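The GZIPRequestWrapper and GZIPRequestStream removed above existed to serve clients that send their request body with `Content-Encoding: gzip`. For illustration only, the following standalone client produces exactly that kind of request using plain JDK classes; the endpoint URL and payload are placeholders, not a statement about the YaCy API.

```java
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPOutputStream;

/**
 * Illustrative client only: sends a gzip-compressed request body, i.e. the kind
 * of request the removed GZIPRequestWrapper used to decompress on the server side.
 */
public class GzipBodyClientSketch {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://localhost:8090/api/push_p.json"); // placeholder endpoint
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setRequestMethod("POST");
        con.setDoOutput(true);
        // announce that the request body is gzip-compressed
        con.setRequestProperty("Content-Encoding", "gzip");
        con.setRequestProperty("Content-Type", "text/plain; charset=UTF-8");

        byte[] payload = "example request body".getBytes(StandardCharsets.UTF_8);
        try (OutputStream raw = con.getOutputStream();
             GZIPOutputStream gz = new GZIPOutputStream(raw)) {
            gz.write(payload);
        }
        System.out.println("HTTP status: " + con.getResponseCode());
    }
}
```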

@ -2071,8 +2071,12 @@ public final class Switchboard extends serverSwitch {
noIndexReason = response.shallIndexCacheForCrawler();
}
// check if the parser supports the mime type
if ( noIndexReason == null ) {
/*
* Optionally check whether a parser supports the media type. Depending on the crawl
* profile, the indexingDocumentProcessor may index only URL metadata, using the
* generic parser, for unsupported media types
*/
if ( noIndexReason == null && !response.profile().isIndexNonParseableUrls()) {
noIndexReason = TextParser.supports(response.url(), response.getMimeType());
}
@ -3009,18 +3013,40 @@ public final class Switchboard extends serverSwitch {
}
}
assert response.getContent() != null;
try {
// parse the document
documents =
TextParser.parseSource(
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
final String supportError = TextParser.supports(response.url(), response.getMimeType());
if (supportError != null) {
/* No parser available or format is denied */
if(response.profile().isIndexNonParseableUrls()) {
/* Apply the generic parser to add the URL as a simple link (no content metadata) to the index */
documents = TextParser.genericParseSource(new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
} else {
this.log.warn("Resource '" + response.url().toNormalform(true) + "' is not supported. " + supportError);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, supportError, -1);
return null;
}
} else {
// parse the document
documents =
TextParser.parseSource(
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
}
if ( documents == null ) {
throw new Parser.Failure("Parser returned null.", response.url());
}
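Condensed, the hunk above implements a three-way decision. The following self-contained sketch is not YaCy code; it restates that decision with plain booleans, and the enum and method names are invented for illustration only.

```java
/**
 * Self-contained illustration (not YaCy code) of the three outcomes implemented
 * above when a fetched document reaches the parsing step.
 */
public class ParsingDecisionSketch {

    enum Outcome { PARSE_FULL_CONTENT, INDEX_URL_ONLY, REJECT_AS_ERROR }

    /**
     * @param parserSupportError null when a parser supports the Media Type,
     *                           otherwise a human-readable reason (as TextParser.supports returns)
     * @param indexNonParseableUrls the crawl profile flag shown in the diff
     */
    static Outcome decide(String parserSupportError, boolean indexNonParseableUrls) {
        if (parserSupportError == null) {
            return Outcome.PARSE_FULL_CONTENT; // TextParser.parseSource(...)
        }
        if (indexNonParseableUrls) {
            return Outcome.INDEX_URL_ONLY;     // TextParser.genericParseSource(...)
        }
        return Outcome.REJECT_AS_ERROR;        // pushed to crawlQueues.errorURL
    }

    public static void main(String[] args) {
        System.out.println(decide(null, false));        // PARSE_FULL_CONTENT
        System.out.println(decide("no parser", true));  // INDEX_URL_ONLY
        System.out.println(decide("no parser", false)); // REJECT_AS_ERROR
    }
}
```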
@ -3070,22 +3096,39 @@ public final class Switchboard extends serverSwitch {
// get the hyperlinks
final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
if (response.profile().indexMedia()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser supports them */
|| response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
/* Handle media links */
for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
// also add all media links to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
for (Map.Entry<DigestURL, String> d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
for (Map.Entry<DigestURL, String> d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
for (Map.Entry<DigestURL, String> d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
}
// insert those hyperlinks to the crawler
MultiProtocolURL nextUrl;
for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
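The four media-link loops above all repeat the same predicate. Here is a self-contained restatement (not YaCy code; all names are invented for illustration) of when a media link ends up on the crawl stack.

```java
/**
 * Self-contained illustration (not YaCy code) of the predicate repeated in the
 * four media-link loops above.
 */
public class MediaLinkStackingSketch {

    /**
     * @param indexNonParseableUrls crawl profile: index unsupported resources as pure links
     * @param alwaysCheckMediaType  crawl profile: always load to cross-check the Content-Type header
     * @param indexMedia            crawl profile: index media documents
     * @param extensionSupported    true when TextParser.supportsExtension(...) returned null
     */
    static boolean shouldStackMediaLink(boolean indexNonParseableUrls,
                                        boolean alwaysCheckMediaType,
                                        boolean indexMedia,
                                        boolean extensionSupported) {
        // mirrors: addAllLinksToCrawlStack || (indexMedia() && supportsExtension(url) == null)
        final boolean addAllLinksToCrawlStack = indexNonParseableUrls || alwaysCheckMediaType;
        return addAllLinksToCrawlStack || (indexMedia && extensionSupported);
    }

    public static void main(String[] args) {
        // a media link with a supported extension, media indexing enabled
        System.out.println(shouldStackMediaLink(false, false, true, true));  // true
        // an unsupported extension is only stacked when one of the two profile flags is set
        System.out.println(shouldStackMediaLink(false, false, true, false)); // false
        System.out.println(shouldStackMediaLink(false, true, true, false));  // true
    }
}
```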

@ -21,11 +21,14 @@
package net.yacy.search.schema;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.id.DigestURL;
@ -62,7 +65,11 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
if (hostname.startsWith("www.")) hostname = hostname.substring(4);
StringBuilder q = new StringBuilder();
q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname);
BlockingQueue<SolrDocument> docs = solrConnector.concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, maxnodes, maxtime, 100, 1, true,
final int pageSize = 100;
final BlockingQueue<SolrDocument> docs = new ArrayBlockingQueue<>(pageSize);
final List<String> queries = new ArrayList<>();
queries.add(q.toString());
final Thread solrQueryTask = new Thread(solrConnector.newDocumentsByQueriesTask(docs, queries, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, maxnodes, maxtime, pageSize, 1,
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.failreason_s.getSolrFieldName(),
@ -71,7 +78,8 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()
);
));
solrQueryTask.start();
SolrDocument doc;
Map<String, FailType> errorDocs = new HashMap<String, FailType>();
HyperlinkEdges inboundEdges = new HyperlinkEdges();
@ -80,7 +88,12 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
try {
retrieval: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
MultiProtocolURL from = new MultiProtocolURL(u);
MultiProtocolURL from;
try {
from = new MultiProtocolURL(u);
} catch (final MalformedURLException e1) {
continue;
}
String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
if (error != null) {
@ -94,7 +107,9 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
HyperlinkEdge.Target linkurl = new HyperlinkEdge.Target(link, HyperlinkType.Inbound);
inboundEdges.addEdge(from, linkurl);
if (stopURL != null && linkurl.equals(stopURL)) break retrieval;
} catch (MalformedURLException e) {}
} catch (final MalformedURLException e) {
/* Continue on the next link */
}
}
links = URIMetadataNode.getLinks(doc, false); // outbound
while (links.hasNext()) {
@ -103,42 +118,49 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
HyperlinkEdge.Target linkurl = new HyperlinkEdge.Target(link, HyperlinkType.Outbound);
outboundEdges.addEdge(from, linkurl);
if (stopURL != null && linkurl.equals(stopURL)) break retrieval;
} catch (MalformedURLException e) {}
} catch (final MalformedURLException e) {
/* Continue on the next link */
}
}
}
if (inboundEdges.size() + outboundEdges.size() > maxnodes) {
break retrieval;
}
}
} catch (InterruptedException e) {
} catch (MalformedURLException e) {
} catch (final InterruptedException e) {
Thread.currentThread().interrupt(); // preserve interrupted thread state
} finally {
/* Ensure termination and proper resource release of the query thread */
solrQueryTask.interrupt();
}
// we use the errorDocs to mark all edges with endpoint to error documents
Iterator<HyperlinkEdge> i = inboundEdges.iterator();
HyperlinkEdge edge;
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.target.toNormalform(true))) {
i.remove();
edge.target.type = HyperlinkType.Dead;
errorEdges.add(edge);
}
}
i = outboundEdges.iterator();
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.target.toNormalform(true))) {
i.remove();
edge.target.type = HyperlinkType.Dead;
errorEdges.add(edge);
}
if(!Thread.currentThread().isInterrupted()) {
// we use the errorDocs to mark all edges with endpoint to error documents
Iterator<HyperlinkEdge> i = inboundEdges.iterator();
HyperlinkEdge edge;
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.target.toNormalform(true))) {
i.remove();
edge.target.type = HyperlinkType.Dead;
errorEdges.add(edge);
}
}
i = outboundEdges.iterator();
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.target.toNormalform(true))) {
i.remove();
edge.target.type = HyperlinkType.Dead;
errorEdges.add(edge);
}
}
// we put all edges together in a specific order which is used to create nodes in an svg display:
// nodes that appear first may be painted over by nodes coming later.
// less important nodes should therefore appear first
this.edges.addAll(outboundEdges);
this.edges.addAll(inboundEdges);
this.edges.addAll(errorEdges);
}
// we put all edges together in a specific order which is used to create nodes in an svg display:
// nodes that appear first may be painted over by nodes coming later.
// less important nodes should therefore appear first
this.edges.addAll(outboundEdges);
this.edges.addAll(inboundEdges);
this.edges.addAll(errorEdges);
}
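The reworked retrieval above follows a classic bounded producer/consumer pattern: the Solr query runs in its own thread, results flow through an ArrayBlockingQueue, a poison document marks the end of results, and the consumer interrupts the producer in a finally block so early exits do not leave the query thread paging forever. Below is a self-contained sketch of that pattern, with plain strings standing in for SolrDocument and invented names throughout.

```java
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

/**
 * Self-contained sketch (not YaCy code) of the producer/consumer pattern used above.
 */
public class PoisonPillSketch {

    private static final String POISON = "__POISON__"; // stands in for AbstractSolrConnector.POISON_DOCUMENT

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> docs = new ArrayBlockingQueue<>(100); // pageSize-bounded queue

        final Thread producer = new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) {
                    docs.put("doc-" + i); // blocks when the consumer falls behind
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // stop producing when the consumer gives up
            } finally {
                try {
                    docs.put(POISON); // always signal end-of-results
                } catch (InterruptedException ignored) {
                }
            }
        });
        producer.start();

        try {
            String doc;
            while (!(doc = docs.take()).equals(POISON)) {
                System.out.println("consumed " + doc);
                if (doc.endsWith("5")) {
                    break; // early exit, e.g. maxnodes reached or stopURL found
                }
            }
        } finally {
            producer.interrupt(); // mirrors solrQueryTask.interrupt() in the finally block above
        }
    }
}
```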
public void path(final Segment segment, DigestURL from, DigestURL to, final int maxtime, final int maxnodes) {
