Load images only if their parser flag is activated

12 years ago · 234a974955
parent b2c329929f
commit 234a974955
3 changed files with 9 additions and 12 deletions
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -55,6 +55,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SMBLoader;
 import net.yacy.crawler.robots.RobotsTxt;
+import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.rwi.IndexCell;
 import net.yacy.kelondro.workflow.WorkflowProcessor;
@@ -347,17 +348,10 @@ public final class CrawlStacker {

        // check availability of parser and maxfilesize
        String warning = null;
-        boolean loadImages = Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
-        if (!loadImages && Switchboard.getSwitchboard().getConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, "").equals("true;")) {
-            // dammit semicolon
-            // TODO: remove this shit later
-            Switchboard.getSwitchboard().setConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
-            loadImages = true;
-        }
        ContentDomain contentDomain = entry.url().getContentDomainFromExt();
        if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
            contentDomain == ContentDomain.APP  ||
-            (!loadImages && contentDomain == ContentDomain.IMAGE) ||
+            (contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) ||
            contentDomain == ContentDomain.AUDIO  ||
            contentDomain == ContentDomain.VIDEO ||
            contentDomain == ContentDomain.CTRL) {
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2556,12 +2556,16 @@ public final class Switchboard extends serverSwitch {
           ) {
            // get the hyperlinks
            final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
-            boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
-            if (loadImages) hl.putAll(Document.getImagelinks(documents));
+            for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
+                if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
+            }
+            
            
            // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
            if (response.profile().directDocByURL()) {
-                if (!loadImages) hl.putAll(Document.getImagelinks(documents));
+                for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
+                    if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue());
+                }
                hl.putAll(Document.getApplinks(documents));
                hl.putAll(Document.getVideolinks(documents));
                hl.putAll(Document.getAudiolinks(documents));
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -323,7 +323,6 @@ public final class SwitchboardConstants {
     * <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
     * <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
     */
-    public static final String CRAWLER_LOAD_IMAGE               = "crawler.load.image";
    public static final String CRAWLER_THREADS_ACTIVE_MAX       = "crawler.MaxActiveThreads";
    public static final String CRAWLER_FOLLOW_REDIRECTS         = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
    public static final String CRAWLER_RECORD_REDIRECTS         = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store