Added new crawler attribute for finer control over Media Type detection

New "Media Type detection" section in the advanced crawl start page
allow to choose between :
- not loading URLs with unknown or unsupported file extension without
checking the actual Media Type (relying Content-Type header for now).
This was the old default behavior, faster, but not really accurate.
- always cross check URL file extension against the actual Media Type.
This lets properly parse URLs ending with an apparently odd file
extension, but which have actually a supported Media Type such as
text/html.

Sample URLs with misleading file extensions were added as documentation to
the crawl start page.

fixes issue #244
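
As a minimal illustration of the choice described above (class, method and helper names below are hypothetical stand-ins, not the exact YaCy API), the loading decision for a URL with an unknown or unsupported file extension looks roughly like this:

public class MediaTypeDetectionSketch {

    /** Stand-in for TextParser.supportsExtension(url): returns null when the extension is supported. */
    static String supportsExtension(final String url) {
        return (url.endsWith(".html") || url.endsWith(".pdf")) ? null : "no parser found for extension";
    }

    /**
     * @return true when the URL should be downloaded so the actual Media Type
     *         (Content-Type header) can be checked, false when it is skipped
     *         without loading.
     */
    static boolean shouldLoad(final String url, final boolean crawlerAlwaysCheckMediaType) {
        if (supportsExtension(url) == null) {
            return true; // supported extension: loaded in both modes
        }
        // Old default: skip immediately (fast, but misses pages such as
        // https://en.wikipedia.org/wiki/Ask.com whose actual Media Type is text/html).
        // New option: load anyway and cross-check against the actual Media Type.
        return crawlerAlwaysCheckMediaType;
    }

    public static void main(final String[] args) {
        System.out.println(shouldLoad("https://en.wikipedia.org/wiki/Ask.com", false)); // false
        System.out.println(shouldLoad("https://en.wikipedia.org/wiki/Ask.com", true));  // true
    }
}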
pull/250/head
luccioman 6 years ago
parent 88d0ed676c
commit fcf6b16db4

@@ -20,6 +20,7 @@
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>

@@ -317,6 +317,27 @@
Obey html-robots-nofollow: <input type="checkbox" name="obeyHtmlRobotsNofollow" id="obeyHtmlRobotsNofollow" #(obeyHtmlRobotsNofollowChecked)#::checked="checked"#(/obeyHtmlRobotsNofollowChecked)# /><!--<br/>
Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />-->
</dd>
<dt>Media Type detection</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type checking info"/>
<span style="right:0px; width:30em;" id="mediaTypeCheckingInfo">
Not loading URLs with an unsupported file extension is faster but less accurate.
Indeed, for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
<ul>
<li><a href="https://en.wikipedia.org/wiki/.de" target="_blank">https://en.wikipedia.org/wiki/.de</a> : the .de extension is unknown, but the actual Media Type of this page is text/html</li>
<li><a href="https://en.wikipedia.org/wiki/Ask.com" target="_blank">https://en.wikipedia.org/wiki/Ask.com</a> : the .com extension is not supported (executable file format), but the actual Media Type of this page is text/html</li>
<li><a href="https://commons.wikimedia.org/wiki/File:YaCy_logo.png" target="_blank">https://commons.wikimedia.org/wiki/File:YaCy_logo.png</a> : the .png extension is a supported image format, but the actual Media Type of this page is text/html</li>
</ul>
</span>
</div>
<label>
<input type="radio" aria-describedby="mediaTypeCheckingInfo" name="crawlerAlwaysCheckMediaType" value="false" #(crawlerAlwaysCheckMediaType)#checked="checked"::#(/crawlerAlwaysCheckMediaType)# /> Do not load URLs with an unsupported file extension
</label>
<label>
<input type="radio" name="crawlerAlwaysCheckMediaType" value="true" #(crawlerAlwaysCheckMediaType)#::checked="checked"#(/crawlerAlwaysCheckMediaType)# /> Always cross check file extension against Content-Type header
</label>
</dd>
<dt>Load Filter on URLs</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.

@@ -213,6 +213,13 @@ public class CrawlStartExpert {
prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
}
// always cross-check URL file extension against actual Media Type?
if (post == null) {
prop.put("crawlerAlwaysCheckMediaType", true);
} else {
prop.put("crawlerAlwaysCheckMediaType", post.getBoolean("crawlerAlwaysCheckMediaType"));
}
// Load Filter on URLs (range)
if (post != null && post.containsKey("range")) {

@@ -332,7 +332,7 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents even when no parser is available
env.setConfig("crawlingDirectDocByURL", directDocByURL);
final String collection = post.get("collection", "user");
@@ -633,6 +633,8 @@ public class Crawler_p {
.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
post.getBoolean("crawlerAlwaysCheckMediaType"));
handle = ASCII.getBytes(profile.handle());

@@ -374,13 +374,20 @@ public final class CrawlStacker implements WorkflowTask<Request>{
return error;
}
// check availability of parser and maxfilesize
String warning = null;
//ContentDomain contentDomain = entry.url().getContentDomainFromExt();
if (TextParser.supportsExtension(entry.url()) != null) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
//if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;
if (!profile.isCrawlerAlwaysCheckMediaType() && TextParser.supportsExtension(entry.url()) != null) {
if(profile.isIndexNonParseableUrls()) {
/* Unsupported file extension and no cross-checking of Media Type: add immediately to the noload stack to index only URL metadata */
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
if (warning != null && CrawlStacker.log.isFine()) {
CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed to " + NoticedURL.StackType.NOLOAD + " stack : " + warning);
}
return null;
}
error = "URL '" + entry.url().toString() + "' file extension is not supported and indexing of linked non-parsable documents is disabled.";
CrawlStacker.log.info(error);
return error;
}
if (global) {

@@ -96,6 +96,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
FOLLOW_FRAMES ("followFrames", false, CrawlAttribute.BOOLEAN, "Flag if frames shall be followed (no by default)"),
OBEY_HTML_ROBOTS_NOINDEX ("obeyHtmlRobotsNoindex", false, CrawlAttribute.BOOLEAN, "Obey html-robots-noindex"),
OBEY_HTML_ROBOTS_NOFOLLOW ("obeyHtmlRobotsNofollow", false, CrawlAttribute.BOOLEAN, "Obey html-robots-nofollow"),
CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
CRAWLER_URL_MUSTMATCH ("crawlerURLMustMatch", false, CrawlAttribute.STRING, "URL Must-Match Filter"),
CRAWLER_URL_MUSTNOTMATCH ("crawlerURLMustNotMatch", false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
CRAWLER_IP_MUSTMATCH ("crawlerIPMustMatch", false, CrawlAttribute.STRING, "IP Must-Match Filter"),
@@ -239,6 +240,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.HANDLE.key, handle);
put(CrawlAttribute.NAME.key, name);
put(CrawlAttribute.AGENT_NAME.key, userAgentName);
put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
@@ -673,11 +675,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public boolean directDocByURL() {
/**
* @return true when URLs of unsupported resources (no parser available or denied format) should
* be indexed as links (with metadata only on URL and not on content).
*/
public boolean isIndexNonParseableUrls() {
final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* @return true when the crawler must always cross-check the URL file
* extension, if any, against the actual Media Type, even when the file
* extension is unknown or unsupported. False when the crawler should not
* load URLs with an unknown or unsupported file extension.
*/
public boolean isCrawlerAlwaysCheckMediaType() {
final String r = get(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key);
if (r == null) {
return false;
}
return (r.equals(Boolean.TRUE.toString()));
}
public CacheStrategy cacheStrategy() {
final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
@@ -889,7 +909,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(CrawlAttribute.AGENT_NAME.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.isIndexNonParseableUrls() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
@@ -903,6 +923,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CrawlAttribute.CACHE_STRAGEGY.key));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));

@@ -55,10 +55,18 @@ public class NoticedURL {
LOCAL, GLOBAL, REMOTE, NOLOAD;
}
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth
private Balancer remoteStack; // links from remote crawl orders (init on demand)
private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
/** links found by crawling to depth-1 */
private Balancer coreStack;
/** links found by crawling at target depth */
private Balancer limitStack;
/** links from remote crawl orders (init on demand) */
private Balancer remoteStack;
/** links that are not passed to a loader; the index will be generated from the Request entry */
private Balancer noloadStack;
private final File cachePath;
protected NoticedURL(

@@ -742,8 +742,12 @@ public class Response {
// -ranges in request
// we checked that in shallStoreCache
// check if document can be indexed
if (this.responseHeader != null) {
/*
* Check, if applicable, whether a parser supports the media type. Depending on
* the crawl profile, the indexingDocumentProcessor may index only URL metadata
* using the generic parser for unsupported media types
*/
if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
final String mimeType = this.responseHeader.getContentType();
final String parserError = TextParser.supportsMime(mimeType);
if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError;

@@ -241,6 +241,29 @@ public final class TextParser {
return docs;
}
/**
* Apply only the generic parser to the given content from location.
*/
public static Document[] genericParseSource(
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignoreClassNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final byte[] content
) throws Parser.Failure {
if (AbstractParser.log.isFine()) {
AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
}
mimeType = normalizeMimeType(mimeType);
Set<Parser> idioms = new HashSet<>();
idioms.add(TextParser.genericIdiom);
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
}
private static Document[] parseSource(
final DigestURL location,
String mimeType,
@@ -644,7 +667,7 @@ public final class TextParser {
* @param url the given url
* @param mimeType the given mime type
* @return a list of Idiom parsers that may be appropriate for the given criteria
* @throws Parser.Failure
* @throws Parser.Failure when the file extension or the MIME type is denied
*/
private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
@@ -661,7 +684,12 @@ public final class TextParser {
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext != null && ext.length() > 0) {
if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
/* We do not throw an exception here when the media type is provided and inconsistent with the extension (if the media type is not supported, an exception has already been thrown).
* Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
* Notable example: Wikimedia Commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
}
idiom = ext2parser.get(ext);
if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
idioms.addAll(idiom);

@@ -2071,8 +2071,12 @@ public final class Switchboard extends serverSwitch {
noIndexReason = response.shallIndexCacheForCrawler();
}
// check if the parser supports the mime type
if ( noIndexReason == null ) {
/*
* Check, if applicable, whether a parser supports the media type. Depending on
* the crawl profile, the indexingDocumentProcessor may index only URL metadata
* using the generic parser for unsupported media types
*/
if ( noIndexReason == null && !response.profile().isIndexNonParseableUrls()) {
noIndexReason = TextParser.supports(response.url(), response.getMimeType());
}
@@ -3009,18 +3013,40 @@ public final class Switchboard extends serverSwitch {
}
}
assert response.getContent() != null;
try {
// parse the document
documents =
TextParser.parseSource(
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
final String supportError = TextParser.supports(response.url(), response.getMimeType());
if (supportError != null) {
/* No parser available or format is denied */
if(response.profile().isIndexNonParseableUrls()) {
/* Apply the generic parser and add the URL as a simple link (no content metadata) to the index */
documents = TextParser.genericParseSource(new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
} else {
this.log.warn("Resource '" + response.url().toNormalform(true) + "' is not supported. " + supportError);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, supportError, -1);
return null;
}
} else {
// parse the document
documents =
TextParser.parseSource(
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
}
if ( documents == null ) {
throw new Parser.Failure("Parser returned null.", response.url());
}
@@ -3070,22 +3096,39 @@ public final class Switchboard extends serverSwitch {
// get the hyperlinks
final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
if (response.profile().indexMedia()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser supports them */
|| response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
/* Handle media links */
for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
for (Map.Entry<DigestURL, String> d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
for (Map.Entry<DigestURL, String> d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
for (Map.Entry<DigestURL, String> d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
}
// insert those hyperlinks to the crawler
MultiProtocolURL nextUrl;
for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {

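
For reference, the media-link stacking rule applied in the last Switchboard hunk above can be summarized as follows. This is a condensed sketch with boolean parameters standing in for the corresponding profile getters and parser check, not the exact YaCy code:

public class MediaLinkStackingSketch {

    /**
     * @param indexNonParseableUrls stand-in for profile.isIndexNonParseableUrls()
     * @param alwaysCheckMediaType  stand-in for profile.isCrawlerAlwaysCheckMediaType()
     * @param indexMedia            stand-in for profile.indexMedia()
     * @param extensionSupported    stand-in for TextParser.supportsExtension(url) == null
     * @return true when the media link should be added to the crawl stack
     */
    static boolean stackMediaLink(final boolean indexNonParseableUrls,
                                  final boolean alwaysCheckMediaType,
                                  final boolean indexMedia,
                                  final boolean extensionSupported) {
        // All media links are stacked when they must be indexed as pure links
        // or re-checked against their actual Media Type.
        final boolean addAllLinksToCrawlStack = indexNonParseableUrls || alwaysCheckMediaType;
        return addAllLinksToCrawlStack || (indexMedia && extensionSupported);
    }

    public static void main(final String[] args) {
        // With the old defaults (no metadata-only indexing, no Media Type cross-check),
        // a media link is stacked only when media indexing is on and a parser supports its extension.
        System.out.println(stackMediaLink(false, false, true, true));  // true
        System.out.println(stackMediaLink(false, true, false, false)); // true: always cross-check
    }
}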