Merge branch 'master' of https://github.com/yacy/yacy_search_server.git

6 years ago · c347e7d3f8
parent 848e9304d9 8e72863a7f
commit c347e7d3f8
27 changed files with 665 additions and 430 deletions
--- a/README.md
+++ b/README.md
@ -76,12 +76,12 @@ NO OTHER SOFTWARE IS REQUIRED!
 Startup and Shutdown of YaCy:

 - on GNU/Linux and OpenBSD:
-to start: execute ./startYACY.sh
-to stop : execute ./stopYACY.sh
+   - to start: execute `./startYACY.sh`
+   - to stop : execute `./stopYACY.sh`

 - on Windows:
-to start: double-click startYACY.bat
-to stop : double-click stopYACY.bat
+   - to start: double-click `startYACY.bat`
+   - to stop : double-click `stopYACY.bat`

 - on Mac OS X:
 please use the Mac Application and start or stop it like any
@ -135,10 +135,14 @@ More details for YaCy on Heroku in [Heroku.md](Heroku.md).
 ## Port 8090 is bad, people are not allowed to access that port

 You can forward port 80 to 8090 with iptables:
+```bash
 iptables -t nat -A PREROUTING -p tcp --dport 80 -j REDIRECT --to-port 8090
+```
+
 On some operating systems, you must first enable access to the ports you are using, for example:
+```bash
 iptables -I INPUT -m tcp -p tcp --dport 8090 -j ACCEPT
-
+```

 ## How can I scale this; how much ram is needed; disk space?

--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@ -20,6 +20,7 @@
 		<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
 		<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
 		<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
+		<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
 		<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
 		<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
 		<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@ -317,6 +317,27 @@
            Obey html-robots-nofollow: <input type="checkbox" name="obeyHtmlRobotsNofollow" id="obeyHtmlRobotsNofollow" #(obeyHtmlRobotsNofollowChecked)#::checked="checked"#(/obeyHtmlRobotsNofollowChecked)# /><!--<br/>
            Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />-->
          </dd>
+ 	        <dt>Media Type detection</dt>
+	        <dd>
+            	<div class="info" style="float:right">
+            		<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type checking info"/>
+            		<span style="right:0px; width:30em;" id="mediaTypeCheckingInfo">
+            			Not loading URLs with unsupported file extension is faster but less accurate. 
+            			Indeed, for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
+            			<ul>
+            				<li><a href="https://en.wikipedia.org/wiki/.de"  target="_blank">https://en.wikipedia.org/wiki/.de</a> : the .de extension is unknown, but the actual Media Type of this page is text/html</li>
+            				<li><a href="https://en.wikipedia.org/wiki/Ask.com"  target="_blank">https://en.wikipedia.org/wiki/Ask.com</a> : the .com extension is not supported (executable file format), but the actual Media Type of this page is text/html</li>
+            				<li><a href="https://commons.wikimedia.org/wiki/File:YaCy_logo.png"  target="_blank">https://commons.wikimedia.org/wiki/File:YaCy_logo.png</a> : the .png extension is a supported image format, but the actual Media Type of this page is text/html</li>
+            			</ul> 
+            		</span>
+            	</div>
+            	<label>
+          			<input type="radio" aria-describedby="mediaTypeCheckingInfo" name="crawlerAlwaysCheckMediaType" value="false" #(crawlerAlwaysCheckMediaType)#checked="checked"::#(/crawlerAlwaysCheckMediaType)# /> Do not load URLs with an unsupported file extension
+	          	</label>
+    	      	<label>
+        	  		<input type="radio" name="crawlerAlwaysCheckMediaType" value="true" #(crawlerAlwaysCheckMediaType)#::checked="checked"#(/crawlerAlwaysCheckMediaType)# /> Always cross check file extension against Content-Type header
+          		</label>
+	        </dd>
 	        <dt>Load Filter on URLs</dt>
 	        <dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
            The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@ -213,6 +213,13 @@ public class CrawlStartExpert {
            prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
            prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
        }
+        
+        // always cross-check URL file extension against actual Media Type ?
+		if (post == null) {
+			prop.put("crawlerAlwaysCheckMediaType", true);
+		} else {
+			prop.put("crawlerAlwaysCheckMediaType", post.getBoolean("crawlerAlwaysCheckMediaType"));
+		}

        // Load Filter on URLs (range)
        if (post != null && post.containsKey("range")) {
--- a/htroot/Crawler_p.html
+++ b/htroot/Crawler_p.html
@ -290,7 +290,7 @@ window.setInterval("setTableSize()", 1000);
 <td>
  <form style="float:right;" action="Crawler_p.html"><input type="submit" name="hidewebstructuregraph" class="btn btn-default btn-xs" value="hide graphic"/><form>
 </td></tr></table>
-<script src="js/d3.v3.min.js"></script>
+<script src="js/d3.v5.min.js"></script>
 <script src="js/hypertree.js"></script>
 <div id="linkstructure"></div>
 <script>$(document).ready(linkstructure("#[hosts]#", "#linkstructure", 1280, 720, 3000, 700));</script>::
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@ -332,7 +332,7 @@ public class Crawler_p {
                env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
                if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;

-                boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
+                boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents even when no parser is available
                env.setConfig("crawlingDirectDocByURL", directDocByURL);

                final String collection = post.get("collection", "user");
@ -633,6 +633,8 @@ public class Crawler_p {
 							.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
 					profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
 					profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
+					profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
+							post.getBoolean("crawlerAlwaysCheckMediaType"));
 					
                    
                    handle = ASCII.getBytes(profile.handle());
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@ -187,8 +187,8 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
      <table class="sortable" style="float:left; border-width: 0">
      <thead>
      	<tr>
-        	<th style="text-align:center; width:32"></th>
-        	<th style="text-align:left; width: 600" class="listing">Path</th>
+        	<th style="text-align:center; width:32px"></th>
+        	<th style="text-align:left; width: 600px" class="listing">Path</th>
        	<th style="text-align:right; padding:2px;" class="listingem">stored</th>
        	<th style="text-align:right; padding:2px;" class="listingem">linked</th>
        	<th style="text-align:right; padding:2px;" class="listingem">pending</th>
@ -196,6 +196,7 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
        	<th style="text-align:right; padding:2px;" class="listingem">failed</th>
      	</tr>
      </thead>
+      <tbody>
      #(root)#
      <tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
        <td style="text-align:center"></td>
@ -226,11 +227,12 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
        </tr>
        #(/type)#
      #{/list}#
+      </tbody>
      </table>
    </fieldset>
    
    #(linkgraph)#<div style="text-align:center"><form><input name="showlinkstructure" onClick="location.href = location.toString() + '&showlinkstructure=';" class="btn btn-default btn-xs" value="show link structure graph"/></form></div>::
-    <script src="js/d3.v3.min.js"></script>
+    <script src="js/d3.v5.min.js"></script>
    <script src="js/hypertree.js"></script>
    <div id="linkstructure"></div>
    <script>$(document).ready(linkstructure("#[host]#", "#linkstructure", 1280, 720, 3000, 700));</script>
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@ -32,7 +32,9 @@ import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.TimeUnit;
 import java.util.regex.Pattern;

 import org.apache.solr.common.SolrDocument;
@ -417,7 +419,11 @@ public class HostBrowser {
                        q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM);
                    }
                }
-                BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, 100, 1, false,
+                final int pageSize = 100;
+                final BlockingQueue<SolrDocument> docs = new ArrayBlockingQueue<>(pageSize);
+                final List<String> queries = new ArrayList<>();
+                queries.add(q.toString());
+                final Thread solrQueryTask = new Thread(fulltext.getDefaultConnector().newDocumentsByQueriesTask(docs, queries, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, pageSize, 1,
                        CollectionSchema.id.getSolrFieldName(),
                        CollectionSchema.sku.getSolrFieldName(),
                        CollectionSchema.failreason_s.getSolrFieldName(),
@ -433,8 +439,8 @@ public class HostBrowser {
                        CollectionSchema.references_exthosts_i.getSolrFieldName(),
                        CollectionSchema.cr_host_chance_d.getSolrFieldName(),
                        CollectionSchema.cr_host_norm_i.getSolrFieldName()   
-                        );
-                SolrDocument doc;
+                        ));
+                solrQueryTask.start();
                Set<String> storedDocs = new HashSet<String>();
                Map<String, FailType> errorDocs = new HashMap<String, FailType>();
                Set<String> inboundLinks = new HashSet<String>();
@ -445,60 +451,72 @@ public class HostBrowser {
                final Collection<String> reloadURLs = new ArrayList<String>();
                final Set<String> reloadURLCollection = new HashSet<String>();
                long timeoutList = System.currentTimeMillis() + TIMEOUT;
+                long remainingTime = TIMEOUT;
                long timeoutReferences = System.currentTimeMillis() + 6000;
                ReferenceReportCache rrCache = sb.index.getReferenceReportCache();
-                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
-                    String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
-                    String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
-                    FailType error = errortype == null ? null : FailType.valueOf(errortype);
-                    String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
-                    infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences));
-                    if (u.startsWith(path)) {
-                        if (delete) {
-                            deleteIDs.add(ids);
-                        } else {
-                            if (error == null) storedDocs.add(u); else {
-                                if (reload404 && error == FailType.fail) {
-                                    ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName());
-                                    if (collections != null) reloadURLCollection.addAll(collections);
-                                    reloadURLs.add(u);
-                                }
-                                if (authorized) errorDocs.put(u, error);
-                            }
-                        }
-                    } else if (complete) {
-                        if (error == null) storedDocs.add(u); else {
-                            if (authorized) errorDocs.put(u, error);
-                        }
-                    }
-                    if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link
-                    if (error == null) {
-                        hostsize++;
-                        // collect inboundlinks to browse the host
-                        Iterator<String> links = URIMetadataNode.getLinks(doc, true);
-                        while (links.hasNext()) {
-                            u = links.next();
-                            if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u);
-                        }
+                try {
+                	SolrDocument doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
+                	while (doc != AbstractSolrConnector.POISON_DOCUMENT && doc != null) {
+                		String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
+                		String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
+                		FailType error = errortype == null ? null : FailType.valueOf(errortype);
+                		String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
+                		infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences));
+                		if (u.startsWith(path)) {
+                			if (delete) {
+                				deleteIDs.add(ids);
+                			} else {
+                				if (error == null) storedDocs.add(u); else {
+                					if (reload404 && error == FailType.fail) {
+                						ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName());
+                						if (collections != null) reloadURLCollection.addAll(collections);
+                						reloadURLs.add(u);
+                					}
+                					if (authorized) errorDocs.put(u, error);
+                				}
+                			}
+                		} else if (complete) {
+                			if (error == null) storedDocs.add(u); else {
+                				if (authorized) errorDocs.put(u, error);
+                			}
+                		}
+                		if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link
+                		if (error == null) {
+                			hostsize++;
+                			// collect inboundlinks to browse the host
+                			Iterator<String> links = URIMetadataNode.getLinks(doc, true);
+                			while (links.hasNext()) {
+                				u = links.next();
+                				if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u);
+                			}
                        
-                        // collect referrer links
-                        links = URIMetadataNode.getLinks(doc, false);
-                        while (links.hasNext()) {
-                            u = links.next();
-                            try {
-                                MultiProtocolURL mu = new MultiProtocolURL(u);
-                                if (mu.getHost() != null) {
-                                    ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost());
-                                    if (lks == null) {
-                                        lks = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
-                                        outboundHosts.put(mu.getHost(), lks);
-                                    }
-                                    lks.set(u, u.length());
-                                }
-                            } catch (final MalformedURLException e) {}
-                        }
-                    }
-                    if (System.currentTimeMillis() > timeoutList) break;
+                			// collect referrer links
+                			links = URIMetadataNode.getLinks(doc, false);
+                			while (links.hasNext()) {
+                				u = links.next();
+                				try {
+                					MultiProtocolURL mu = new MultiProtocolURL(u);
+                					if (mu.getHost() != null) {
+                						ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost());
+                						if (lks == null) {
+                							lks = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
+                							outboundHosts.put(mu.getHost(), lks);
+                						}
+                						lks.set(u, u.length());
+                					}
+                				} catch (final MalformedURLException e) {}
+                			}
+                		}
+                		
+                		remainingTime = timeoutList - System.currentTimeMillis();
+                		if (remainingTime <= 0) {
+                			break;
+                		}
+                		doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
+                	}
+                } finally {
+                	/* Ensure termination and proper resources release of the query thread */
+               		solrQueryTask.interrupt();
                }
                if (deleteIDs.size() > 0) sb.remove(deleteIDs);
                if (reloadURLs.size() > 0) {
--- a/htroot/api/snapshot.java
+++ b/htroot/api/snapshot.java
@ -50,6 +50,7 @@ import net.yacy.crawler.data.Snapshots;
 import net.yacy.crawler.data.Snapshots.Revisions;
 import net.yacy.crawler.data.Transactions;
 import net.yacy.document.ImageParser;
+import net.yacy.http.servlets.TemplateMissingParameterException;
 import net.yacy.http.servlets.TemplateProcessingException;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.peers.graphics.EncodedImage;
@ -69,14 +70,25 @@ public class snapshot {

    public static Object respond(final RequestHeader header, serverObjects post, final serverSwitch env) {
        final Switchboard sb = (Switchboard) env;
+        
+    	final serverObjects defaultResponse = new serverObjects();
+    	

        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
        final String ext = header.get(HeaderFramework.CONNECTION_PROP_EXT, "");
        
+        if(ext.isEmpty()) {
+			throw new TemplateProcessingException("Missing extension. Try with rss, xml, json, pdf, png or jpg." + ext,
+					HttpStatus.SC_BAD_REQUEST);
+        }
+        
        
        if (ext.equals("rss")) {
            // create a report about the content of the snapshot directory
-            if (!authenticated) return null;
+            if (!authenticated) {
+            	defaultResponse.authenticationRequired();
+            	return defaultResponse;
+            }
            int maxcount = post == null ? 10 : post.getInt("maxcount", 10);
            int depthx = post == null ? -1 : post.getInt("depth", -1);
            Integer depth = depthx == -1 ? null : depthx;
@ -108,7 +120,10 @@ public class snapshot {
        if (post == null) post = new serverObjects();
        final boolean xml = ext.equals("xml");
        final boolean pdf = ext.equals("pdf");
-        if (pdf && !authenticated) return null;
+        if (pdf && !authenticated) {
+        	defaultResponse.authenticationRequired();
+        	return defaultResponse;
+        }
        final boolean pngjpg = ext.equals("png") || ext.equals(DEFAULT_EXT);
        String urlhash = post.get("urlhash", "");
        String url = post.get("url", "");
@ -127,7 +142,6 @@ public class snapshot {
                ConcurrentLog.logException(e);
            }
        }
-        if (url.length() == 0 && durl != null) url = durl.toNormalform(true);

        if (ext.equals("json")) {
            // command interface: view and change a transaction state, get metadata about transactions in the past
@ -141,7 +155,10 @@ public class snapshot {
                    for (Map.Entry<String, Integer> state: Transactions.sizes().entrySet()) sizes.put(state.getKey(), state.getValue());
                    result.put("size", sizes);
                } else if (command.equals("list")) {
-                    if (!authenticated) return null;
+                    if (!authenticated) {
+                    	defaultResponse.authenticationRequired();
+                    	return defaultResponse;
+                    }
                    // return a status of the transaction archive
                    String host = post.get("host");
                    String depth = post.get("depth");
@ -179,7 +196,10 @@ public class snapshot {
                        }
                    }
                } else if (command.equals("commit")) {
-                    if (!authenticated) return null;
+                    if (!authenticated) {
+                    	defaultResponse.authenticationRequired();
+                    	return defaultResponse;
+                    }
                    Revisions r = Transactions.commit(urlhash);
                    if (r != null) {
                        result.put("result", "success");
@ -191,7 +211,10 @@ public class snapshot {
                    }
                    result.put("urlhash", urlhash);
                } else if (command.equals("rollback")) {
-                    if (!authenticated) return null;
+                    if (!authenticated) {
+                    	defaultResponse.authenticationRequired();
+                    	return defaultResponse;
+                    }
                    Revisions r = Transactions.rollback(urlhash);
                    if (r != null) {
                        result.put("result", "success");
@ -235,30 +258,36 @@ public class snapshot {
        }
        
        // for the following methods we always need the durl to fetch data
-        if (durl == null) return null;
+        if (durl == null) {
+        	throw new TemplateMissingParameterException("Missing valid url or urlhash parameter");
+        }
        
        if (xml) {
            Collection<File> xmlSnapshots = Transactions.findPaths(durl, "xml", Transactions.State.ANY);
            File xmlFile = null;
-            if (xmlSnapshots.size() == 0) {
-                return null;
+            if (xmlSnapshots.isEmpty()) {
+				throw new TemplateProcessingException("Could not find the xml snapshot file.", HttpStatus.SC_NOT_FOUND);
            }
            xmlFile = xmlSnapshots.iterator().next();
            try {
                byte[] xmlBinary = FileUtils.read(xmlFile);
                return new ByteArrayInputStream(xmlBinary);
-            } catch (IOException e) {
+            } catch (final IOException e) {
                ConcurrentLog.logException(e);
-                return null;
+                throw new TemplateProcessingException("Could not read the xml snapshot file.");
            }
        }
        
        if (pdf || pngjpg) {
            Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY);
            File pdfFile = null;
-            if (pdfSnapshots.size() == 0) {
+            if (pdfSnapshots.isEmpty()) {
                // if the client is authenticated, we create the pdf on the fly!
-                if (!authenticated) return null;
+                if (!authenticated) {
+					throw new TemplateProcessingException(
+							"Could not find the pdf snapshot file. You must be authenticated to generate one on the fly.",
+							HttpStatus.SC_NOT_FOUND);
+                }
                SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash());
                boolean success = false;
                if (sd == null) {
@ -269,19 +298,25 @@ public class snapshot {
                }
                if (success) {
                    pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);
-                    if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next();
+                    if (!pdfSnapshots.isEmpty()) {
+                    	pdfFile = pdfSnapshots.iterator().next();
+                    }
                }
            } else {
                pdfFile = pdfSnapshots.iterator().next();
            }
-            if (pdfFile == null) return null;
+            if (pdfFile == null) {
+				throw new TemplateProcessingException(
+						"Could not find the pdf snapshot file and could not generate one on the fly.",
+						HttpStatus.SC_NOT_FOUND);
+            }
            if (pdf) {
                try {
                    byte[] pdfBinary = FileUtils.read(pdfFile);
                    return new ByteArrayInputStream(pdfBinary);
-                } catch (IOException e) {
+                } catch (final IOException e) {
                    ConcurrentLog.logException(e);
-                    return null;
+					throw new TemplateProcessingException("Could not read the pdf snapshot file.");
                }
            }
            
@ -338,6 +373,8 @@ public class snapshot {
            }
        }
        
-        return null;
+		throw new TemplateProcessingException(
+				"Unsupported extension : " + ext + ". Try with rss, xml, json, pdf, png or jpg.",
+				HttpStatus.SC_BAD_REQUEST);
    }
 }
--- a/htroot/env/hypertree.css
+++ b/htroot/env/hypertree.css
@ -32,6 +32,22 @@ circle {
 }
 text {
  font: 9px sans-serif;
-  pointer-events: none;
+  cursor: default;
  text-shadow: 0 1px 0 #fff, 1px 0 0 #fff, 0 -1px 0 #fff, -1px 0 0 #fff;
 }
+
+text tspan.truncated {
+	display: none;
+}
+
+text:hover tspan.truncated {
+	display: inherit;
+}
+
+text tspan.ellipsis {
+	display: inherit;
+}
+
+text:hover tspan.ellipsis {
+	display: none;
+}
--- a/htroot/js/d3.v3.min.js
+++ b/htroot/js/d3.v3.min.js
--- a/htroot/js/d3.v5.min.js
+++ b/htroot/js/d3.v5.min.js
--- a/htroot/js/hypertree.js
+++ b/htroot/js/hypertree.js
@ -20,15 +20,26 @@
 function linkstructure(hostname, element, width, height, maxtime, maxnodes) {
 	var nodes = {};
 	var links = [];
-	var linkstructure = {};
 	$.getJSON("api/linkstructure.json?about=" + hostname + "&maxtime=" + maxtime + "&maxnodes=" + maxnodes, function(linkstructure) {
 		links = linkstructure.graph;
 		links.forEach(function(link) {
 			  link.source = nodes[link.source] || (nodes[link.source] = {name: link.source, type:"Inbound"});
 			  link.target = nodes[link.target] || (nodes[link.target] = {name: link.target, type:link.type});
 		});
-		var force = d3.layout.force().nodes(d3.values(nodes)).links(links).size([width, height]).linkDistance(60).charge(-800).on("tick", tick).start();
-		force.gravity(0.7);
+		
+		/* attract nodes to the center - was set with force.gravity(0.7) in d3v3 */
+		var forceX = d3.forceX(width / 2).strength(0.7);
+		var forceY = d3.forceY(height / 2).strength(0.7);
+		
+		var link = d3.forceLink(links).distance(60).strength(1);
+		var simulation = d3.forceSimulation()
+			.nodes(d3.values(nodes))
+			.force('link', link)
+			.force("center", d3.forceCenter(width / 2, height / 2)) // center elements - was set with size([width, height]) in d3v3
+			.force('charge', d3.forceManyBody().strength(-800))
+			.force('x', forceX)
+			.force('y',  forceY)
+			.on("tick", ticked);
 		var svg = d3.select(element).append("svg").attr("id", "hypertree").attr("width", width).attr("height", height);
 		svg.append("defs").selectAll("marker")
 		    .data(["Dead", "Outbound", "Inbound"])
@ -49,15 +60,25 @@ function linkstructure(hostname, element, width, height, maxtime, maxnodes) {
 		svg.append("text").attr("x", 10).attr("y", height - 10).text("blue: links to other domains").attr("style", "font-size:9px").attr("fill", "lightblue");
 		svg.append("text").attr("x", 10).attr("y", height).text("red: dead links").attr("style", "font-size:9px").attr("fill", "red");
 		var path = svg.append("g")
-			.selectAll("path").data(force.links()).enter().append("path")
+			.selectAll("path").data(link.links()).enter().append("path")
 			.attr("class",function(d) {return "hypertree-link " + d.type; })
 			.attr("marker-end", function(d) { return "url(#" + d.type + ")";});
-		var circle = svg.append("g").selectAll("circle").data(force.nodes()).enter().append("circle").attr("r", 4).call(force.drag);
+		var circle = svg.append("g").selectAll("circle").data(simulation.nodes()).enter().append("circle").attr("r", 4).call(d3.drag());
+		var maxTextLength = 40;
 		var text = svg.append("g")
-			.selectAll("text").data(force.nodes()).enter().append("text").attr("x", 8).attr("y", ".31em")
+			.selectAll("text").data(simulation.nodes()).enter().append("text").attr("x", 8).attr("y", ".31em")
 			.attr("style", function(d) {return d.type == "Outbound" ? "fill:#888888;" : "fill:#000000;";})
-			.text(function(d) {return d.name;});
-		function tick() {
+			.text(function(d) {/* Limit the length of nodes visible text to improve readability */ return d.name.substring(0, Math.min(d.name.length, maxTextLength));});
+		text.append("tspan")
+			.attr("class", "truncated")
+			.text(function(d) {/* The end of large texts is wraped in a tspan, made visible on mouse overing */return d.name.length > maxTextLength ? d.name.substring(maxTextLength) : ""});
+		
+		text.append("tspan")
+			.attr("class", "ellipsis")
+			.text(function(d) {/* Add an ellipsis to mark long texts that are truncated */ return d.name.length > maxTextLength ? "..." : ""});
+
+		
+		function ticked() {
 		  path.attr("d", linkArc);
 		  circle.attr("transform", transform);
 		  text.attr("transform", transform);
--- a/htroot/jslicense.html
+++ b/htroot/jslicense.html
@ -99,9 +99,9 @@
 			<td><a href="env/bootstrap/js/typeahead.jquery.js">typeahead.jquery.js</a> (0.10.5)</td>
 		</tr>
 		<tr>
-			<td><a href="js/d3.v3.min.js">d3.v3.min.js</a></td>
+			<td><a href="js/d3.v5.min.js">d3.v5.min.js</a></td>
 			<td><a href="http://opensource.org/licenses/BSD-3-Clause">Modified-BSD</a></td>
-			<td><a href="https://raw.githubusercontent.com/d3/d3.github.com/b3382f60bf721923c7c649709adcfb4c8b66d994/d3.v3.js">d3.v3.js</a>  (3.4.4)</td>
+			<td><a href="https://unpkg.com/d3@5.7.0/dist/d3.js">d3.js</a>  (5.7.0)</td>
 		</tr>
 		<tr>
 			<td><a href="js/highslide/highslide.js">highslide.js</a></td>
--- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java
@ -21,6 +21,7 @@
 package net.yacy.cora.federate.solr.connector;

 import java.io.IOException;
+import java.io.InterruptedIOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
@ -31,24 +32,17 @@ import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;

-import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.sorting.ClusteredScoreMap;
-import net.yacy.cora.sorting.ReversibleScoreMap;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.cora.util.LookAheadIterator;
-import net.yacy.kelondro.data.word.Word;
-import net.yacy.search.schema.CollectionSchema;
-
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.response.FacetField;
-import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.FacetField.Count;
+import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
@ -58,6 +52,14 @@ import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.DisMaxParams;
 import org.apache.solr.common.params.FacetParams;

+import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.sorting.ClusteredScoreMap;
+import net.yacy.cora.sorting.ReversibleScoreMap;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.cora.util.LookAheadIterator;
+import net.yacy.kelondro.data.word.Word;
+import net.yacy.search.schema.CollectionSchema;
+
 public abstract class AbstractSolrConnector implements SolrConnector {

    protected static Set<String> SOLR_ID_FIELDS = new HashSet<String>();
@ -170,19 +172,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
    }
    
    /**
-     * Get results from solr queries as a stream of documents.
-     * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
-     * The method returns immediately and feeds the search results into the queue
-     * @param querystrings the list of solr query strings
-     * @param sort the solr sort string, may be null to be not used
-     * @param offset first result offset
-     * @param maxcount the maximum number of results
-     * @param maxtime the maximum time in milliseconds
-     * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
-     * @param concurrency is the number of AbstractSolrConnector.POISON_DOCUMENT entries to add at the end of the feed
-     * @param prefetchIDs if true, then first all IDs are fetched and then all documents are queries by the ID. If false then documents are retrieved directly
-     * @param fields list of fields
-     * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+     * {@inheritDoc}
     */
    @Override
    public BlockingQueue<SolrDocument> concurrentDocumentsByQueries(
@ -195,12 +185,11 @@ public abstract class AbstractSolrConnector implements SolrConnector {
            final int concurrency,
            final boolean prefetchIDs,
            final String ... fields) {
-        assert buffersize > 0;
-        if (!prefetchIDs) return concurrentDocumentsByQueriesNoPrefetch(querystrings, sort, offset, maxcount, maxtime, buffersize, concurrency, fields);
        final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(Math.max(buffersize, concurrency));
-        if (querystrings.size() == 0) {
-            for (int i = 0; i < Math.max(1, concurrency); i++) try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {}
-            return queue;
+        if (!prefetchIDs) {
+        	final Thread t = new Thread(newDocumentsByQueriesTask(queue, querystrings, sort, offset, maxcount, maxtime, buffersize, concurrency, fields));
+        	t.start();
+        	return queue;
        }
        final BlockingQueue<String> idQueue = concurrentIDsByQueries(querystrings, sort, offset, maxcount, maxtime, Math.min(maxcount, 10000000), concurrency);
        final long endtime = maxtime < 0 || maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity!
@ -235,7 +224,9 @@ public abstract class AbstractSolrConnector implements SolrConnector {
        return queue;
    }
    
-    private BlockingQueue<SolrDocument> concurrentDocumentsByQueriesNoPrefetch(
+    @Override
+    public Runnable newDocumentsByQueriesTask(
+    		final BlockingQueue<SolrDocument> queue,
            final List<String> querystrings,
            final String sort,
            final int offset,
@ -244,59 +235,85 @@ public abstract class AbstractSolrConnector implements SolrConnector {
            final int buffersize,
            final int concurrency,
            final String ... fields) {
-        assert buffersize > 0;
-        final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(buffersize);
-        if (querystrings.size() == 0) {
-            for (int i = 0; i < Math.max(1, concurrency); i++) try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {}
-            return queue;
+    	Objects.requireNonNull(queue, "The queue parameter must not be null.");
+    	
+        if (querystrings == null || querystrings.isEmpty()) {
+			return () -> {
+				for (int i = 0; i < Math.max(1, concurrency); i++) {
+					try {
+						queue.put(AbstractSolrConnector.POISON_DOCUMENT);
+					} catch (final InterruptedException e1) {
+						Thread.currentThread().interrupt(); // preserve interrupted thread state
+					}
+				}
+			};
        }
        final long endtime = maxtime < 0 || maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity!
        final int ps = buffersize < 0 ? pagesize_docs : Math.min(pagesize_docs, buffersize);
        final int maxretries = 6;
-        final Thread t = new Thread() {
-            @Override
-            public void run() {
-                try {
-                    for (String querystring: querystrings) {
-                        this.setName("AbstractSolrConnector:concurrentDocumentsByQueryNoPrefetch(" + querystring + ")");
-                        int o = offset;
-                        int count = 0;
-                        int retry = 0;
-                        loop: while (System.currentTimeMillis() < endtime && count < maxcount) {
-                            try {
-                                SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, ps), fields);
-                                for (SolrDocument d: sdl) {
-                                    try {queue.put(d);} catch (final InterruptedException e) {break;}
-                                    count++;
-                                }
-                                if (sdl.size() < ps) {
-                                    //System.out.println("sdl.size() = " + sdl.size() + ", pagesize = " + pagesize);
-                                    break loop; // finished
-                                }
-                                o += sdl.size();
-                                retry = 0;
-                            } catch (final SolrException | IOException e) {
-                                ConcurrentLog.logException(e);
-                                if (retry++ < maxretries) {
-                                    // remote Solr may be temporary down, so we wait a bit
-                                    try {Thread.sleep(100);} catch (InterruptedException e1) {}
-                                    continue loop;
-                                }
-                                // fail
-                                ConcurrentLog.severe("AbstractSolrConnector", "aborted concurrentDocumentsByQueryNoPrefetch after " + maxretries + " retries: " + e.getMessage());
-                                break;
-                            }
+        return () -> {
+        	long remainingTime = endtime - System.currentTimeMillis();
+            try {
+                for (final String querystring: querystrings) {
+                    Thread.currentThread().setName("AbstractSolrConnector:concurrentDocumentsByQueryNoPrefetch(" + querystring + ")");
+                    int o = offset;
+                    int count = 0;
+                    int retry = 0;
+                    loop: while (remainingTime > 0 && count < maxcount) {
+                          try {
+                             final SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, ps), fields);
+                             for (final SolrDocument d: sdl) {
+                            		if (endtime != Long.MAX_VALUE) {
+                            			/*
+                            			 * A timeout is defined : we must not use here queue.put() otherwise this
+                            			 * thread could indefinitely wait here when the queue is full and the
+                            			 * consumer thread has stopped taking in the queue.
+                            			 */
+                            			if (!queue.offer(d, remainingTime, TimeUnit.MILLISECONDS)) {
+                            				break;
+                            			}
+                            		} else {
+                            			queue.put(d);
+                            		}
+                                count++;
+                             }
+                             if (sdl.size() < ps) {
+                                break loop; // finished
+                             }
+                             o += sdl.size();
+                             retry = 0;
+                         } catch(final InterruptedIOException e) {
+                        	 throw new InterruptedException(); // rethrow to finish the process
+                         } catch (final SolrException | IOException e) {
+                             ConcurrentLog.logException(e);
+                             if (retry++ < maxretries) {
+                                // remote Solr may be temporary down, so we wait a bit
+								Thread.sleep(100);
+                                continue loop;
+                             }
+                             // fail
+                             ConcurrentLog.severe("AbstractSolrConnector", "aborted concurrentDocumentsByQueryNoPrefetch after " + maxretries + " retries: " + e.getMessage());
+                             break;
                        }
-                    }
-                } catch (Throwable e) {} finally {
-                    for (int i = 0; i < Math.max(1, concurrency); i++) {
-                        try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {}
+                        remainingTime = endtime - System.currentTimeMillis();
                    }
                }
+            } catch(final InterruptedException e) {
+            	Thread.currentThread().interrupt(); // preserve interrupted thread state
+            } catch (final RuntimeException e) {
+                ConcurrentLog.logException(e);
+            } finally {
+               	/* Add poison elements only when the thread has not been interrupted */
+               	for (int i = 0; i < Math.max(1, concurrency); i++) {
+               		try {
+               			queue.put(AbstractSolrConnector.POISON_DOCUMENT);
+               		} catch (final InterruptedException e1) {
+               			Thread.currentThread().interrupt(); // preserve interrupted thread state
+               			break; // thread is interrupted : in that case we no more try to add poison elements to the queue
+               		}
+               	}
            }
        };
-        t.start();
-        return queue;
    }
    
    /**
--- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
@ -224,9 +224,11 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
    public LinkedHashMap<String, ReversibleScoreMap<String>> getFacets(String query, int maxresults, final String ... fields) throws IOException;
    
    /**
-     * Get results from a solr query as a stream of documents.
+     * <p>Get results from solr queries as a stream of documents.
     * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
-     * The method returns immediately and feeds the search results into the queue
+     * The method returns immediately and feeds the search results into the queue.</p>
+     * <p><strong>Important</strong> : be careful if the consumer thread(s) terminate before taking the poison document(s) from the queue, 
+     * as the producer thread(s) may indefinitely block on their last step (adding poison element) because the queue would be full.</p>
     * @param querystring the solr query string
     * @param sort the solr sort string, may be null to be not used
     * @param offset first result offset
@ -249,6 +251,27 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
            final boolean prefetchIDs,
            final String ... fields);
    
+	/**
+	 * Creates a new runnable task to run a given list of Solr queries and fill a
+	 * results queue by packets of a limited number of results.
+	 * 
+	 * @param queue        the results queue. Must not be null.
+	 * @param querystrings a list of Solr queries
+	 * @param sort         an optional Solr sort criterion
+	 * @param offset       the results offset position for each query
+	 * @param maxcount     the maximum number of documents per query to retrieve
+	 * @param maxtime      the total maximum time to spend. Unlimited when the value
+	 *                     is negative or equals to Long.MAX_VALUE
+	 * @param buffersize   this is the maximum size of a page of results to retrieve
+	 *                     in one step when running a query
+	 * @param concurrency  the number of consuming threads
+	 * @param fields       the indexed fields to retrieve
+	 * @return a ready to run task
+	 */
+	public Runnable newDocumentsByQueriesTask(final BlockingQueue<SolrDocument> queue, final List<String> querystrings,
+			final String sort, final int offset, final int maxcount, final long maxtime, final int buffersize,
+			final int concurrency, final String... fields);
+    
    /**
     * Get results from solr queries as a stream of documents.
     * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
--- a/source/net/yacy/cora/protocol/ClientIdentification.java
+++ b/source/net/yacy/cora/protocol/ClientIdentification.java
@ -78,6 +78,13 @@ public class ClientIdentification {
    public final static String customAgentName = "Custom Agent";
    public final static String browserAgentName = "Random Browser";
    public static Agent browserAgent;
+
+    /**
+     * provide system information (this is part of YaCy protocol)
+     */
+    public static final String yacySystem = System.getProperty("os.arch", "no-os-arch") + " " +
+            System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") +
+            "; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation(); // keep this before the following static initialization block as this constant is used by generateYaCyBot()
    
    static {
        generateYaCyBot("new");
@ -87,13 +94,6 @@ public class ClientIdentification {
        agents.put(yacyProxyAgentName, yacyProxyAgent);
    }
    
-    /**
-     * provide system information (this is part of YaCy protocol)
-     */
-    public static final String yacySystem = System.getProperty("os.arch", "no-os-arch") + " " +
-            System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") +
-            "; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation();
-
    /**
     * produce a YaCy user agent string
     * @param addinfo
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@ -43,6 +43,9 @@ import javax.swing.text.html.HTMLEditorKit;
 import javax.swing.text.html.ImageView;

 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.protocol.Domains;
+import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.document.ImageParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.OS;
@ -58,10 +61,16 @@ import org.apache.pdfbox.rendering.PDFRenderer;
 public class Html2Image {
    
    // Mac
-    // to install wkhtmltopdf, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from http://wkhtmltopdf.org/downloads.html
+	/**
+	 * Path to wkhtmltopdf executable on Mac OS when installed using
+	 * wkhtmltox-n.n.n.macos-cocoa.pkg from https://wkhtmltopdf.org/downloads.html.
+	 * This can also be a path on Debian or another Gnu/Linux distribution.
+	 */
+	private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf");
+    
    // to install imagemagick, download from http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip
    // the convert command from imagemagick needs ghostscript, if not present on older macs, download a version of gs from http://pages.uoregon.edu/koch/
-    private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf");  // sometimes this is also the path on debian
+    
    private final static File convertMac1 = new File("/opt/local/bin/convert");
    private final static File convertMac2 = new File("/opt/ImageMagick/bin/convert");
    
@ -69,11 +78,27 @@ public class Html2Image {
    // to install: apt-get install wkhtmltopdf imagemagick xvfb ghostscript
    private final static File wkhtmltopdfDebian = new File("/usr/bin/wkhtmltopdf"); // there is no wkhtmltoimage, use convert to create images
    private final static File convertDebian = new File("/usr/bin/convert");
+    
+	/**
+	 * Path to wkhtmltopdf executable on Windows, when installed with default
+	 * settings using wkhtmltox-n.n.n.msvc2015-win64.exe from
+	 * https://wkhtmltopdf.org/downloads.html
+	 */
+	private static final File WKHTMLTOPDF_WINDOWS = new File("C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe");
+
+	/**
+	 * Path to wkhtmltopdf executable on Windows, when installed with default
+	 * settings using wkhtmltox-n.n.n.msvc2015-win32.exe from
+	 * https://wkhtmltopdf.org/downloads.html
+	 */
+	private static final File WKHTMLTOPDF_WINDOWS_X86 = new File(
+			"C:\\Program Files (x86)\\wkhtmltopdf\\bin\\wkhtmltopdf.exe");

    private static boolean usexvfb = false;

    public static boolean wkhtmltopdfAvailable() {
-        return wkhtmltopdfMac.exists() || wkhtmltopdfDebian.exists();
+		return OS.isWindows ? (WKHTMLTOPDF_WINDOWS.exists() || WKHTMLTOPDF_WINDOWS_X86.exists())
+				: (wkhtmltopdfMac.exists() || wkhtmltopdfDebian.exists());
    }
    
    public static boolean convertAvailable() {
@ -107,7 +132,9 @@ public class Html2Image {
    }
    
    private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination, final String userAgent, final String acceptLanguage, final boolean ignoreErrors) {
-        final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
+		final File wkhtmltopdf = OS.isWindows
+				? (WKHTMLTOPDF_WINDOWS.exists() ? WKHTMLTOPDF_WINDOWS : WKHTMLTOPDF_WINDOWS_X86)
+				: (wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian);
        String commandline =
                wkhtmltopdf.getAbsolutePath() + " -q --title '" + url + "' " +
                //acceptLanguage == null ? "" : "--custom-header 'Accept-Language' '" + acceptLanguage + "' " + 
@ -285,12 +312,54 @@ public class Html2Image {
        ImageIO.write(img, destination.getName().endsWith("jpg") ? "jpg" : "png", destination);
    }
    
-    public static void main(String[] args) {
-        try {
-            Html2Image.writeSwingImage(args[0], new Dimension(1200, 2000), new File(args[1]));
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
+    /**
+     * Test PDF or image snapshot generation for a given URL.
+     * @param args main arguments list:
+     * <ol>
+     * 	<li>Source remote URL (required)</li>
+     * 	<li>Target local file path (required)</li>
+     * 	<li>Snapshot generation method identifier (optional) :
+     * 		<ul>
+     * 			<li>"wkhtmltopdf" (default): generate a PDF snapshot using external wkhtmltopdf tool.</li>
+     * 			<li>"swing" : use JRE provided Swing to generate a jpg or png image snapshot.</li>
+     * 		</ul>
+     * 	</li>
+     * </ol>
+     */
+	public static void main(String[] args) {
+		try {
+			if (args.length < 2) {
+				System.out.println("Missing required parameter(s).");
+				System.out.println("Usage : java " + Html2Image.class.getName()
+						+ " <url> <target-file[.pdf|.jpg|.png]> [wkhtmltopdf|swing]");
+				return;
+			}
+			if (args.length < 3 || "wkhtmltopdf".equals(args[2])) {
+				if(Html2Image.wkhtmltopdfAvailable()) {
+					Html2Image.writeWkhtmltopdf(args[0], null, ClientIdentification.yacyInternetCrawlerAgent.userAgent,
+							"en-us,en;q=0.5", new File(args[1]));					
+				} else {
+					System.out.println("Unable to locate wkhtmltopdf executable on this system!");
+				}
+			} else if ("swing".equals(args[2])) {
+				try {
+					Html2Image.writeSwingImage(args[0], new Dimension(1200, 2000), new File(args[1]));
+				} catch (final IOException e) {
+					e.printStackTrace();
+				}
+			} else {
+				System.out.println("Unknown method : please specify either wkhtmltopdf or swing");
+			}
+		} finally {
+			/* Shutdown running threads */
+			Domains.close();
+			try {
+				HTTPClient.closeConnectionManager();
+			} catch (final InterruptedException e) {
+				Thread.currentThread().interrupt(); // restore interrupted state
+			}
+			ConcurrentLog.shutdown();
+		}
+	}
    
 }
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@ -374,13 +374,20 @@ public final class CrawlStacker implements WorkflowTask<Request>{
            return error;
        }

-        // check availability of parser and maxfilesize
        String warning = null;
-        //ContentDomain contentDomain = entry.url().getContentDomainFromExt();
-        if (TextParser.supportsExtension(entry.url()) != null) {
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
-            //if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
-            return null;
+        if (!profile.isCrawlerAlwaysCheckMediaType() && TextParser.supportsExtension(entry.url()) != null) {
+        	if(profile.isIndexNonParseableUrls()) {
+        		/* Unsupported file extension and no cross-checking of Media Type : add immediately to the noload stack to index only URL metadata */
+        		warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
+        		if (warning != null && CrawlStacker.log.isFine()) {
+        			CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed to " + NoticedURL.StackType.NOLOAD + " stack : " + warning);
+        		}
+        		return null;
+        	}
+        	
+            error = "URL '" + entry.url().toString() + "' file extension is not supported and indexing of linked non-parsable documents is disabled.";
+            CrawlStacker.log.info(error);
+            return error;
        }

        if (global) {
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@ -96,6 +96,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        FOLLOW_FRAMES                ("followFrames",               false, CrawlAttribute.BOOLEAN, "Flag if frames shall be followed (no by default)"),
        OBEY_HTML_ROBOTS_NOINDEX     ("obeyHtmlRobotsNoindex",      false, CrawlAttribute.BOOLEAN, "Obey html-robots-noindex"),
        OBEY_HTML_ROBOTS_NOFOLLOW    ("obeyHtmlRobotsNofollow",     false, CrawlAttribute.BOOLEAN, "Obey html-robots-nofollow"),
+        CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
        CRAWLER_URL_MUSTMATCH        ("crawlerURLMustMatch",        false, CrawlAttribute.STRING,  "URL Must-Match Filter"),
        CRAWLER_URL_MUSTNOTMATCH     ("crawlerURLMustNotMatch",     false, CrawlAttribute.STRING,  "URL Must-Not-Match Filter"),
        CRAWLER_IP_MUSTMATCH         ("crawlerIPMustMatch",         false, CrawlAttribute.STRING,  "IP Must-Match Filter"),
@ -239,6 +240,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        put(CrawlAttribute.HANDLE.key,           handle);
        put(CrawlAttribute.NAME.key,             name);
        put(CrawlAttribute.AGENT_NAME.key, userAgentName);
+        put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
        put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key,     (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
        put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key,  (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
        put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key,      (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
@ -673,11 +675,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        }
    }

-    public boolean directDocByURL() {
+	/**
+	 * @return true when URLs of unsupported resources (no parser available or denied format) should
+	 *         be indexed as links (with metadata only on URL and not on content).
+	 */
+    public boolean isIndexNonParseableUrls() {
        final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
+    
+	/**
+	 * @return true when the crawler must always cross check the URL file extension
+	 *         (if any) against the actual Media Type, even when file extension is
+	 *         unknown or unsupported. False when the crawler should not load URLs
+	 *         with an unknown or unsupported file extension.
+	 */
+	public boolean isCrawlerAlwaysCheckMediaType() {
+		final String r = get(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key);
+		if (r == null) {
+			return false;
+		}
+		return (r.equals(Boolean.TRUE.toString()));
+	}

    public CacheStrategy cacheStrategy() {
        final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
@ -889,7 +909,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(CrawlAttribute.AGENT_NAME.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
-        prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
+        prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.isIndexNonParseableUrls() ? 1 : 0);
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
        prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
        //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
@ -903,6 +923,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CrawlAttribute.CACHE_STRAGEGY.key));
+        prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
--- a/source/net/yacy/crawler/data/NoticedURL.java
+++ b/source/net/yacy/crawler/data/NoticedURL.java
@ -55,10 +55,18 @@ public class NoticedURL {
        LOCAL, GLOBAL, REMOTE, NOLOAD;
    }

-    private Balancer coreStack;      // links found by crawling to depth-1
-    private Balancer limitStack;     // links found by crawling at target depth
-    private Balancer remoteStack;    // links from remote crawl orders (init on demand)
-    private Balancer noloadStack;    // links that are not passed to a loader; the index will be generated from the Request entry
+    /** links found by crawling to depth-1 */
+    private Balancer coreStack;
+    
+    /** links found by crawling at target depth */
+    private Balancer limitStack;
+    
+    /** links from remote crawl orders (init on demand) */
+    private Balancer remoteStack;
+    
+    /** links that are not passed to a loader; the index will be generated from the Request entry */
+    private Balancer noloadStack;
+    
    private final File cachePath;

    protected NoticedURL(
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@ -742,8 +742,12 @@ public class Response {
        // -ranges in request
        // we checked that in shallStoreCache

-        // check if document can be indexed
-        if (this.responseHeader != null) {
+		/*
+		 * Check, when required, if a parser supports the media type. Depending on the crawl
+		 * profile, the indexingDocumentProcessor may instead index only URL metadata
+		 * using the generic parser for unsupported media types.
+		 */
+        if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
            final String mimeType = this.responseHeader.getContentType();
            final String parserError = TextParser.supportsMime(mimeType);
            if (parserError != null && TextParser.supportsExtension(url()) != null)  return "no parser available: " + parserError;
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -241,6 +241,29 @@ public final class TextParser {
        return docs;
    }
    
+    /**
+     * Apply only the generic parser to the given content from location.
+     */
+    public static Document[] genericParseSource(
+            final DigestURL location,
+            String mimeType,
+            final String charset,
+            final Set<String> ignoreClassNames,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final int depth,
+            final byte[] content
+        ) throws Parser.Failure {
+        if (AbstractParser.log.isFine()) {
+        	AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
+        }
+        mimeType = normalizeMimeType(mimeType);
+        Set<Parser> idioms = new HashSet<>();
+        idioms.add(TextParser.genericIdiom);
+
+        return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+    }
+    
    private static Document[] parseSource(
            final DigestURL location,
            String mimeType,
@ -644,7 +667,7 @@ public final class TextParser {
     * @param url the given url
     * @param mimeType the given mime type
     * @return a list of Idiom parsers that may be appropriate for the given criteria
-     * @throws Parser.Failure
+     * @throws Parser.Failure when the file extension or the MIME type is denied
     */
    private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
        final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
@ -661,7 +684,12 @@ public final class TextParser {
        // check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
        String ext = MultiProtocolURL.getFileExtension(url.getFileName());
        if (ext != null && ext.length() > 0) {
-            if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
+        	/* We do not throw here an exception when the media type is provided and inconsistent with the extension (if it is not supported an exception has already been thrown). 
+        	 * Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
+        	 * Notable example : wikimedia commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
+            if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
+            	throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
+            }
            idiom = ext2parser.get(ext);
            if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
                idioms.addAll(idiom);
--- a/source/net/yacy/http/Jetty9HttpServerImpl.java
+++ b/source/net/yacy/http/Jetty9HttpServerImpl.java
@ -34,6 +34,7 @@ import java.util.StringTokenizer;
 import javax.net.ssl.KeyManagerFactory;
 import javax.net.ssl.SSLContext;

+import org.eclipse.jetty.http.HttpMethod;
 import org.eclipse.jetty.http.HttpVersion;
 import org.eclipse.jetty.server.Connector;
 import org.eclipse.jetty.server.Handler;
@ -144,18 +145,20 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer {
        //sholder.setInitParameter("welcomeFile", "index.html"); // default is index.html, welcome.html
        htrootContext.addServlet(sholder, "/*");
        
-        /* Handle gzip compression of responses to user agents accepting it */
-		final GzipHandler gzipHandler;
-		if (sb.getConfigBool(SwitchboardConstants.SERVER_RESPONSE_COMPRESS_GZIP,
+		final GzipHandler gzipHandler = new GzipHandler();
+		/*
+		 * Decompression of incoming requests body is required for index distribution
+		 * APIs /yacy/transferRWI.html and /yacy/transferURL.html. This was previously
+		 * handled by a GZIPRequestWrapper in the YaCyDefaultServlet.
+		 */
+		gzipHandler.setInflateBufferSize(4096);
+		
+		if (!sb.getConfigBool(SwitchboardConstants.SERVER_RESPONSE_COMPRESS_GZIP,
 				SwitchboardConstants.SERVER_RESPONSE_COMPRESS_GZIP_DEFAULT)) {
-			gzipHandler = new GzipHandler();
-			/*
-			 * Ensure decompression of requests body is disabled : it is already handled by
-			 * the GZIPRequestWrapper in the YaCyDefaultServlet
-			 */
-			gzipHandler.setInflateBufferSize(0);
-			htrootContext.setGzipHandler(gzipHandler);
+			/* Gzip compression of responses can be disabled by user configuration */
+			gzipHandler.setExcludedMethods(HttpMethod.GET.asString(), HttpMethod.POST.asString());
 		}
+		htrootContext.setGzipHandler(gzipHandler);

        // -----------------------------------------------------------------------------
        // here we set and map the mandatory servlets, needed for typical YaCy operation
--- a/source/net/yacy/http/servlets/YaCyDefaultServlet.java
+++ b/source/net/yacy/http/servlets/YaCyDefaultServlet.java
@ -42,25 +42,33 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.Executors;
 import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.TimeUnit;
-import java.util.zip.GZIPInputStream;

-import javax.servlet.ReadListener;
 import javax.servlet.RequestDispatcher;
 import javax.servlet.ServletContext;
 import javax.servlet.ServletException;
-import javax.servlet.ServletInputStream;
 import javax.servlet.UnavailableException;
 import javax.servlet.http.Cookie;
 import javax.servlet.http.HttpServlet;
 import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletRequestWrapper;
 import javax.servlet.http.HttpServletResponse;

+import org.apache.commons.fileupload.FileItem;
+import org.apache.commons.fileupload.FileItemFactory;
+import org.apache.commons.fileupload.disk.DiskFileItemFactory;
+import org.apache.commons.fileupload.servlet.ServletFileUpload;
+import org.eclipse.jetty.http.HttpHeader;
+import org.eclipse.jetty.http.HttpMethod;
+import org.eclipse.jetty.http.MimeTypes;
+import org.eclipse.jetty.io.WriterOutputStream;
+import org.eclipse.jetty.server.InclusiveByteRange;
+import org.eclipse.jetty.util.MultiPartOutputStream;
+import org.eclipse.jetty.util.URIUtil;
+import org.eclipse.jetty.util.resource.Resource;
+
+import com.google.common.net.HttpHeaders;
+
 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.order.Base64Order;
@ -75,38 +83,19 @@ import net.yacy.data.InvalidURLLicenceException;
 import net.yacy.data.TransactionManager;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.kelondro.util.NamePrefixThreadFactory;
 import net.yacy.peers.Seed;
 import net.yacy.peers.graphics.EncodedImage;
 import net.yacy.peers.operation.yacyBuildProperties;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
-import net.yacy.server.http.HTTPDFileHandler;
-import net.yacy.server.http.TemplateEngine;
 import net.yacy.server.serverClassLoader;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
 import net.yacy.server.servletProperties;
+import net.yacy.server.http.HTTPDFileHandler;
+import net.yacy.server.http.TemplateEngine;
 import net.yacy.visualization.RasterPlotter;

-import org.apache.commons.fileupload.FileItem;
-import org.apache.commons.fileupload.FileItemFactory;
-import org.apache.commons.fileupload.disk.DiskFileItemFactory;
-import org.apache.commons.fileupload.servlet.ServletFileUpload;
-import org.eclipse.jetty.http.HttpHeader;
-import org.eclipse.jetty.http.HttpMethod;
-import org.eclipse.jetty.http.MimeTypes;
-import org.eclipse.jetty.io.WriterOutputStream;
-import org.eclipse.jetty.server.InclusiveByteRange;
-import org.eclipse.jetty.util.MultiPartOutputStream;
-import org.eclipse.jetty.util.URIUtil;
-import org.eclipse.jetty.util.resource.Resource;
-
-import com.google.common.net.HttpHeaders;
-import com.google.common.util.concurrent.SimpleTimeLimiter;
-import com.google.common.util.concurrent.TimeLimiter;
-import com.google.common.util.concurrent.UncheckedTimeoutException;
-
 /**
 * YaCyDefaultServlet based on Jetty DefaultServlet.java 
 * handles static files and the YaCy servlets.
@ -152,8 +141,6 @@ public class YaCyDefaultServlet extends HttpServlet  {
    protected static final File TMPDIR = new File(System.getProperty("java.io.tmpdir"));
    protected static final int SIZE_FILE_THRESHOLD = 1024 * 1024 * 1024; // 1GB is a lot but appropriate for multi-document pushed using the push_p.json servlet
    protected static final FileItemFactory DISK_FILE_ITEM_FACTORY = new DiskFileItemFactory(SIZE_FILE_THRESHOLD, TMPDIR);
-	private final static TimeLimiter timeLimiter = new SimpleTimeLimiter(Executors.newCachedThreadPool(
-			new NamePrefixThreadFactory(YaCyDefaultServlet.class.getSimpleName() + ".timeLimiter")));
    /* ------------------------------------------------------------ */
    @Override
    public void init() throws UnavailableException {
@ -866,12 +853,7 @@ public class YaCyDefaultServlet extends HttpServlet  {
            RequestHeader legacyRequestHeader = generateLegacyRequestHeader(request, target, targetExt);
            // add multipart-form fields to parameter
            if (ServletFileUpload.isMultipartContent(request)) {
-                final String bodyEncoding = request.getHeader(HeaderFramework.CONTENT_ENCODING);
-                if (HeaderFramework.CONTENT_ENCODING_GZIP.equalsIgnoreCase(bodyEncoding)) {
-                    parseMultipart(new GZIPRequestWrapper(request),args);
-                } else {
-                    parseMultipart(request, args);
-                }
+                parseMultipart(request, args);
            }
            // eof modification to read attribute
            Object tmp;
@ -1336,122 +1318,4 @@ public class YaCyDefaultServlet extends HttpServlet  {
            ConcurrentLog.info("FILEHANDLER", ex.getMessage());
        }
    }
-
-    /**
-     * wraps request to uncompress gzip'ed input stream
-     */
-    private class GZIPRequestWrapper extends HttpServletRequestWrapper {
-
-        private final ServletInputStream is;
-
-        public GZIPRequestWrapper(HttpServletRequest request) throws IOException {
-            super(request);
-            this.is = new GZIPRequestStream(request);
-        }
-
-        @Override
-        public ServletInputStream getInputStream() throws IOException {
-            return is;
-        }
-
-    }
-
-    private class GZIPRequestStream extends ServletInputStream {
-
-    	private final GZIPInputStream in;
-        private final ServletInputStream sin;
-
-        public GZIPRequestStream(HttpServletRequest request) throws IOException {
-        	sin = request.getInputStream();
-        	in = new GZIPInputStream(sin);
-        }
-
-        @Override
-        public int read() throws IOException {
-        	return in.read();
-        }
-
-        @Override
-        public int read(byte[] b) throws IOException {
-        	return read(b, 0, b.length);
-        }
-
-        @Override
-        public int read(byte[] b, int off, int len) throws IOException {
-        	try {
-        		return timeLimiter.callWithTimeout(new CallableReader(in, b, off, len), len + 600, TimeUnit.MILLISECONDS, false);
-        	} catch (final UncheckedTimeoutException e) {
-        		return -1;
-        	} catch (Exception e) {
-				throw new IOException(e);
-			}
-        }
-
-        @Override
-        public void close() throws IOException {
-        	in.close();
-        }
-        
-        @Override
-        public int available() throws IOException {
-        	return in.available();
-        }
-        
-        @Override
-        public synchronized void mark(int readlimit) {
-        	in.mark(readlimit);
-        }
-        
-        @Override
-        public boolean markSupported() {
-        	return in.markSupported();
-        }
-        
-        @Override
-        public synchronized void reset() throws IOException {
-        	in.reset();
-        }
-        
-        @Override
-        public long skip(long n) throws IOException {
-        	return in.skip(n);
-        }
-
-        @Override
-        public boolean isFinished() {
-        	try {
-            	return available() < 1;
-            } catch (final IOException ex) {
-                return true;
-            }
-        }
-
-        @Override
-        public boolean isReady() {
-            return sin.isReady() && !isFinished();
-        }
-
-        @Override
-        public void setReadListener(ReadListener rl) {
-        	sin.setReadListener(rl);
-        }
-    }
-    
-    private class CallableReader implements Callable<Integer> {
-    	private int off, len;
-    	private byte[] b;
-    	private GZIPInputStream in;
-    	
-    	public CallableReader(final GZIPInputStream in, byte[] b, int off, int len) {
-    		this.in = in;
-    		this.b = b;
-    		this.off = off;
-    		this.len = len;
-    	}
-    	
-    	@Override
-		public Integer call() throws Exception {
-			return in.read(b, off, len);
-		}
-    }
 }
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -2071,8 +2071,12 @@ public final class Switchboard extends serverSwitch {
            noIndexReason = response.shallIndexCacheForCrawler();
        }

-        // check if the parser supports the mime type
-        if ( noIndexReason == null ) {
+		/*
+		 * Eventually check if a parser supports the media type. Depending on the crawl
+		 * profile, the indexingDocumentProcessor can eventually index only URL metadata
+		 * using the generic parser for unsupported media types
+		 */
+        if ( noIndexReason == null && !response.profile().isIndexNonParseableUrls()) {
            noIndexReason = TextParser.supports(response.url(), response.getMimeType());
        }

@ -3009,18 +3013,40 @@ public final class Switchboard extends serverSwitch {
            }
        }
        assert response.getContent() != null;
+        
        try {
-            // parse the document
-            documents =
-                TextParser.parseSource(
-                    new AnchorURL(response.url()),
-                    response.getMimeType(),
-                    response.getCharacterEncoding(),
-                    response.profile().ignoreDivClassName(),
-                    response.profile().scraper(),
-                    response.profile().timezoneOffset(),
-                    response.depth(),
-                    response.getContent());
+            final String supportError = TextParser.supports(response.url(), response.getMimeType());
+    		if (supportError != null) {
+    			/* No parser available or format is denied */
+    			if(response.profile().isIndexNonParseableUrls()) {
+    				/* Apply the generic parser and add the URL as a simple link (no content metadata) to the index */
+    				documents = TextParser.genericParseSource(new AnchorURL(response.url()),
+                        response.getMimeType(),
+                        response.getCharacterEncoding(),
+                        response.profile().ignoreDivClassName(),
+                        response.profile().scraper(),
+                        response.profile().timezoneOffset(),
+                        response.depth(),
+                        response.getContent());
+    			} else {
+    	            this.log.warn("Resource '" + response.url().toNormalform(true) + "' is not supported. " + supportError);
+    	            // create a new errorURL DB entry
+    	            this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, supportError, -1);
+    				return null;
+    			}
+    		} else {
+    			// parse the document
+    			documents =
+    					TextParser.parseSource(
+    							new AnchorURL(response.url()),
+    							response.getMimeType(),
+    							response.getCharacterEncoding(),
+    							response.profile().ignoreDivClassName(),
+    							response.profile().scraper(),
+    							response.profile().timezoneOffset(),
+    							response.depth(),
+    							response.getContent());
+    		}
            if ( documents == null ) {
                throw new Parser.Failure("Parser returned null.", response.url());
            }
@ -3070,22 +3096,39 @@ public final class Switchboard extends serverSwitch {
            // get the hyperlinks
            final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
            
-            if (response.profile().indexMedia()) {
-                for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
-                    if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
+			final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser support them */
+					|| response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
+			
+			/* Handle media links */
+			
+			for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
+				if (addAllLinksToCrawlStack
+						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+				}
+			}
+			
+			for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
+				if (addAllLinksToCrawlStack
+						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+				}
+			}
+			
+			for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
+				if (addAllLinksToCrawlStack
+						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+				}
+			}
+			
+			for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
+				if (addAllLinksToCrawlStack
+						|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+					hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+				}
+			}
            
-            // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
-            if (response.profile().directDocByURL()) {
-                for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
-                    if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-                for (Map.Entry<DigestURL, String> d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
-                for (Map.Entry<DigestURL, String> d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
-                for (Map.Entry<DigestURL, String> d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
-            }
-
            // insert those hyperlinks to the crawler
            MultiProtocolURL nextUrl;
            for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
--- a/source/net/yacy/search/schema/HyperlinkGraph.java
+++ b/source/net/yacy/search/schema/HyperlinkGraph.java
@ -21,11 +21,14 @@
 package net.yacy.search.schema;

 import java.net.MalformedURLException;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;

 import net.yacy.cora.document.id.DigestURL;
@ -62,7 +65,11 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
        if (hostname.startsWith("www.")) hostname = hostname.substring(4);
        StringBuilder q = new StringBuilder();
        q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname);
-        BlockingQueue<SolrDocument> docs = solrConnector.concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, maxnodes, maxtime, 100, 1, true,
+        final int pageSize = 100;
+        final BlockingQueue<SolrDocument> docs = new ArrayBlockingQueue<>(pageSize);
+        final List<String> queries = new ArrayList<>();
+        queries.add(q.toString());
+        final Thread solrQueryTask = new Thread(solrConnector.newDocumentsByQueriesTask(docs, queries, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, maxnodes, maxtime, pageSize, 1, 
                CollectionSchema.id.getSolrFieldName(),
                CollectionSchema.sku.getSolrFieldName(),
                CollectionSchema.failreason_s.getSolrFieldName(),
@ -71,7 +78,8 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
                CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
                CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
                CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()
-                );
+                ));
+        solrQueryTask.start();
        SolrDocument doc;
        Map<String, FailType> errorDocs = new HashMap<String, FailType>();
        HyperlinkEdges inboundEdges = new HyperlinkEdges();
@ -80,7 +88,12 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
        try {
            retrieval: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
-                MultiProtocolURL from = new MultiProtocolURL(u);
+                MultiProtocolURL from;
+				try {
+					from = new MultiProtocolURL(u);
+				} catch (final MalformedURLException e1) {
+					continue;
+				}
                String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
                FailType error = errortype == null ? null : FailType.valueOf(errortype);
                if (error != null) {
@ -94,7 +107,9 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
                            HyperlinkEdge.Target linkurl = new HyperlinkEdge.Target(link, HyperlinkType.Inbound);
                            inboundEdges.addEdge(from, linkurl);
                            if (stopURL != null && linkurl.equals(stopURL)) break retrieval;
-                        } catch (MalformedURLException e) {}
+                        } catch (final MalformedURLException e) {
+                        	/* Continue on the next link */
+                        }
                    }
                    links = URIMetadataNode.getLinks(doc, false); // outbound
                    while (links.hasNext()) {
@ -103,42 +118,49 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
                            HyperlinkEdge.Target linkurl = new HyperlinkEdge.Target(link, HyperlinkType.Outbound);
                            outboundEdges.addEdge(from, linkurl);
                            if (stopURL != null && linkurl.equals(stopURL)) break retrieval;
-                        } catch (MalformedURLException e) {}
+                        } catch (final MalformedURLException e) {
+                        	/* Continue on the next link */
+                        }
                    }
                }
                if (inboundEdges.size() + outboundEdges.size() > maxnodes) {
                    break retrieval;
                }
            }
-        } catch (InterruptedException e) {
-        } catch (MalformedURLException e) {
+        } catch (final InterruptedException e) {
+        	Thread.currentThread().interrupt(); // preserve interrupted thread state
+        } finally {
+        	/* Ensure termination and proper resources release of the query thread */
+        	solrQueryTask.interrupt();
        }
-        // we use the errorDocs to mark all edges with endpoint to error documents
-        Iterator<HyperlinkEdge> i = inboundEdges.iterator();
-        HyperlinkEdge edge;
-        while (i.hasNext()) {
-            edge = i.next();
-            if (errorDocs.containsKey(edge.target.toNormalform(true))) {
-                i.remove();
-                edge.target.type = HyperlinkType.Dead;
-                errorEdges.add(edge);
-            }
-        }
-        i = outboundEdges.iterator();
-        while (i.hasNext()) {
-            edge = i.next();
-            if (errorDocs.containsKey(edge.target.toNormalform(true))) {
-                i.remove();
-                edge.target.type = HyperlinkType.Dead;
-                errorEdges.add(edge);
-            }
+        if(!Thread.currentThread().isInterrupted()) {
+        	// we use the errorDocs to mark all edges with endpoint to error documents
+        	Iterator<HyperlinkEdge> i = inboundEdges.iterator();
+        	HyperlinkEdge edge;
+        	while (i.hasNext()) {
+        		edge = i.next();
+        		if (errorDocs.containsKey(edge.target.toNormalform(true))) {
+        			i.remove();
+        			edge.target.type = HyperlinkType.Dead;
+        			errorEdges.add(edge);
+        		}
+        	}
+        	i = outboundEdges.iterator();
+        	while (i.hasNext()) {
+        		edge = i.next();
+        		if (errorDocs.containsKey(edge.target.toNormalform(true))) {
+        			i.remove();
+        			edge.target.type = HyperlinkType.Dead;
+        			errorEdges.add(edge);
+        		}
+        	}
+        	// we put all edges together in a specific order which is used to create nodes in a svg display:
+        	// nodes that appear first are possibly painted over by nodes coming later.
+        	// less important nodes shall appear therefore first
+        	this.edges.addAll(outboundEdges);
+        	this.edges.addAll(inboundEdges);
+        	this.edges.addAll(errorEdges);
        }
-        // we put all edges together in a specific order which is used to create nodes in a svg display:
-        // notes that appear first are possible painted over by nodes coming later.
-        // less important nodes shall appear therefore first
-        this.edges.addAll(outboundEdges);
-        this.edges.addAll(inboundEdges);
-        this.edges.addAll(errorEdges);
    }
    
    public void path(final Segment segment, DigestURL from, DigestURL to, final int maxtime, final int maxnodes) {