fixed image search and favicon loading

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4225 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · af10f729df
parent edba2b7bcc
commit af10f729df
9 changed files with 89 additions and 141 deletions
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@ -581,7 +581,7 @@ public final class httpdProxyHandler {
            // handle file types and make (possibly transforming) output stream
            if (
                    (!transformer.isIdentityTransformer()) &&
-                    (plasmaParser.supportedRealTimeContent(url,res.responseHeader.mime()))
+                    (plasmaParser.supportedHTMLContent(url,res.responseHeader.mime()))
                ) {
                // make a transformer
                theLogger.logFine("create transformer for URL " + url);
@ -794,8 +794,8 @@ public final class httpdProxyHandler {
                
                // make a transformer
                if (( !transformer.isIdentityTransformer()) &&
-                        (ext == null || !plasmaParser.supportedRealtimeFileExtContains(url)) &&
-                        (plasmaParser.realtimeParsableMimeTypesContains(cachedResponseHeader.mime()))) {
+                        (ext == null || !plasmaParser.supportedHTMLFileExtContains(url)) &&
+                        (plasmaParser.HTMLParsableMimeTypesContains(cachedResponseHeader.mime()))) {
                    hfos = new htmlFilterWriter((chunkedOut != null) ? chunkedOut : respond, charSet, null, transformer, (ext.length() == 0));
                } else {
                    hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
--- a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
+++ b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
@ -39,6 +39,7 @@ import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaCrawlProfile;
 import de.anomic.plasma.plasmaCrawlZURL;
 import de.anomic.plasma.plasmaHTCache;
+import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.logging.serverLog;
 import de.anomic.tools.crypt;
@ -518,7 +519,7 @@ public class plasmaCrawlQueues {
                0, 
                0);
        
-        return loader.load(centry);
+        return loader.load(centry, (forText) ? plasmaParser.PARSER_MODE_CRAWLER : plasmaParser.PARSER_MODE_IMAGE);
    }
    
    public int size() {
@ -547,7 +548,7 @@ public class plasmaCrawlQueues {
                } else {
                    // starting a load from the internet
                    this.entry.setStatus("worker-loading");
-                    String result = loader.process(this.entry);
+                    String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER);
                    if (result != null) {
                        plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), "cannot load: " + result);
                        eentry.store();
--- a/source/de/anomic/plasma/crawler/plasmaHTTPLoader.java
+++ b/source/de/anomic/plasma/crawler/plasmaHTTPLoader.java
@ -130,11 +130,11 @@ public final class plasmaHTTPLoader {
        );
    }    
   
-    public plasmaHTCache.Entry load(plasmaCrawlEntry entry) {
-        return load(entry, DEFAULT_CRAWLING_RETRY_COUNT);
+    public plasmaHTCache.Entry load(plasmaCrawlEntry entry, String parserMode) {
+        return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT);
    }
    
-    private plasmaHTCache.Entry load(plasmaCrawlEntry entry, int retryCount) {
+    private plasmaHTCache.Entry load(plasmaCrawlEntry entry, String parserMode, int retryCount) {

        if (retryCount < 0) {
            this.log.logInfo("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted.");
@ -212,7 +212,7 @@ public final class plasmaHTTPLoader {
                // request has been placed and result has been returned. work off response
                File cacheFile = plasmaHTCache.getCachePath(entry.url());
                try {
-                    if (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,entry.url(),res.responseHeader.mime())) {
+                    if (plasmaParser.supportedContent(parserMode, entry.url(), res.responseHeader.mime())) {
                        // delete old content
                        if (cacheFile.isFile()) {
                            plasmaHTCache.deleteURLfromCache(entry.url());
@ -310,7 +310,7 @@ public final class plasmaHTTPLoader {
                        
                        // retry crawling with new url
                        entry.redirectURL(redirectionUrl);
-                        return load(entry, retryCount - 1);
+                        return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1);
                        
                    }
            } else {
--- a/source/de/anomic/plasma/crawler/plasmaProtocolLoader.java
+++ b/source/de/anomic/plasma/crawler/plasmaProtocolLoader.java
@ -61,23 +61,23 @@ public final class plasmaProtocolLoader {
        return (HashSet) this.supportedProtocols.clone();
    }
    
-    public plasmaHTCache.Entry load(plasmaCrawlEntry entry) {
+    public plasmaHTCache.Entry load(plasmaCrawlEntry entry, String parserMode) {
        // getting the protocol of the next URL                
        String protocol = entry.url().getProtocol();
        
-        if ((protocol.equals("http") || (protocol.equals("https")))) return httpLoader.load(entry);
+        if ((protocol.equals("http") || (protocol.equals("https")))) return httpLoader.load(entry, parserMode);
        if (protocol.equals("ftp")) return ftpLoader.load(entry);
        
        this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + entry.url());
        return null;
    }
    
-    public String process(plasmaCrawlEntry entry) {
+    public String process(plasmaCrawlEntry entry, String parserMode) {
        // load a resource, store it to htcache and push queue entry to switchboard queue
        // returns null if everything went fine, a fail reason string if a problem occurred
        plasmaHTCache.Entry h;
        try {
-            h = load(entry);
+            h = load(entry, parserMode);
            entry.setStatus("loaded");
            if (h == null) return "load failed";
            boolean stored = sb.htEntryStoreProcess(h);
--- a/source/de/anomic/plasma/plasmaCrawlEntry.java
+++ b/source/de/anomic/plasma/plasmaCrawlEntry.java
@ -1,6 +1,6 @@
-// plasmaCrawlBalancerEntry.java
-// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
-// first published 14.03.2007 on http://www.anomic.de
+// plasmaCrawlEntry.java
+// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 14.03.2007 on http://yacy.net
 //
 // This is a part of YaCy, a peer-to-peer based web search engine
 //
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -1,12 +1,16 @@
 // plasmaParser.java 
-// ------------------------
-// part of YaCy
-// (C) by Michael Peter Christen; mc@anomic.de
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2005
+// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published in January 2005 on http://yacy.net
+// with contributions 02.05.2005 by Martin Thelian
 //
-// last major change: 02.05.2005 by Martin Thelian
+// This is a part of YaCy, a peer-to-peer based web search engine
 //
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+// 
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
@ -20,27 +24,6 @@
 // You should have received a copy of the GNU General Public License
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this softare or this documentation. The usage of this software
-// is on your own risk. The installation and usage (starting/running) of this
-// software may allow other people or application to access your computer and
-// any attached devices and is highly dependent on the configuration of the
-// software which must be done by the user of the software; the author(s) is
-// (are) also not responsible for proper configuration and usage of the
-// software, even if provoked by documentation provided together with
-// the software.
-//
-// Any changes to this file according to the GPL as documented in the file
-// gpl.txt aside this file in the shipment you received can be done to the
-// lines that follows this copyright notice here, but changes must not be
-// done inside the copyright notive above. A re-distribution must contain
-// the intact and unchanged copyright notice.
-// Contributions and changes to the program code must be marked as such.
-
-// compile: javac -classpath lib/commons-collections.jar:lib/commons-pool.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java

 package de.anomic.plasma;

@ -78,15 +61,17 @@ import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacyURL;

 public final class plasmaParser {
-    public static final String PARSER_MODE_PROXY   = "PROXY";
-    public static final String PARSER_MODE_CRAWLER = "CRAWLER";
+    public static final String PARSER_MODE_PROXY         = "PROXY";
+    public static final String PARSER_MODE_CRAWLER       = "CRAWLER";
    public static final String PARSER_MODE_URLREDIRECTOR = "URLREDIRECTOR";
-    public static final String PARSER_MODE_ICAP = "ICAP";
+    public static final String PARSER_MODE_ICAP          = "ICAP";
+    public static final String PARSER_MODE_IMAGE         = "IMAGE";
    public static final HashSet PARSER_MODE = new HashSet(Arrays.asList(new String[]{
            PARSER_MODE_PROXY,
            PARSER_MODE_CRAWLER,
            PARSER_MODE_ICAP,
-            PARSER_MODE_URLREDIRECTOR
+            PARSER_MODE_URLREDIRECTOR,
+            PARSER_MODE_IMAGE
    }));
    
    private static final HashMap parserConfigList = new HashMap();
@ -98,15 +83,10 @@ public final class plasmaParser {
    public static final Properties availableParserList = new Properties();
    
    /**
-     * A list of file extensions that are supported by the html-parser and can
-     * be parsed in realtime.
+     * Sets of file extensions and mime types that are supported by the html-parser
     */
-    public static final HashSet supportedRealtimeFileExt = new HashSet();
-    
-    /**
-     * A list of mimeTypes that can be parsed in Realtime (on the fly)
-     */
-    public static final HashSet realtimeParsableMimeTypes = new HashSet();    
+    public static final HashSet supportedHTMLFileExt = new HashSet();
+    public static final HashSet supportedHTMLMimeTypes = new HashSet();    
    
    private static final Properties mimeTypeLookupByFileExt = new Properties();
    static {
@ -194,34 +174,24 @@ public final class plasmaParser {
    }
    
    /**
-     * This function is used to initialize the realtimeParsableMimeTypes List.
+     * This function is used to initialize the supportedHTMLMimeTypes set.
     * This list contains a list of mimeTypes that can be parsed in realtime by
     * the yacy html-Parser
-     * @param realtimeParsableMimeTypes a list of mimetypes that can be parsed by the 
+     * @param htmlParsableMimeTypes a list of mimetypes that can be parsed by the 
     * yacy html parser
     */
-    public static void initRealtimeParsableMimeTypes(String realtimeParsableMimeTypes) {
+    public static void initHTMLParsableMimeTypes(String htmlParsableMimeTypes) {
        LinkedList mimeTypes = new LinkedList();
-        if ((realtimeParsableMimeTypes == null) || (realtimeParsableMimeTypes.length() == 0)) {
-            // Nothing todo here
-        } else {            
-            String[] realtimeParsableMimeTypeList = realtimeParsableMimeTypes.split(",");        
-            for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim());
+        if ((htmlParsableMimeTypes == null) || (htmlParsableMimeTypes.length() == 0)) {
+            return;
        }
-        initRealtimeParsableMimeTypes(mimeTypes);
-    }
-    
-    /**
-     * This function is used to initialize the realtimeParsableMimeTypes List.
-     * This list contains a list of mimeTypes that can be parsed in realtime by
-     * the yacy html-Parser
-     * @param realtimeParsableMimeTypes a list of mimetypes that can be parsed by the 
-     * yacy html parser
-     */    
-    public static void initRealtimeParsableMimeTypes(List mimeTypesList) {
-        synchronized (realtimeParsableMimeTypes) {
-            realtimeParsableMimeTypes.clear();
-            realtimeParsableMimeTypes.addAll(mimeTypesList);
+        String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes.split(",");        
+        for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) {
+            mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim());
+        }
+        synchronized (supportedHTMLMimeTypes) {
+            supportedHTMLMimeTypes.clear();
+            supportedHTMLMimeTypes.addAll(mimeTypes);
        }        
    }
    
@ -277,32 +247,31 @@ public final class plasmaParser {
        }        
    }
    
-    public static void initSupportedRealtimeFileExt(List supportedRealtimeFileExtList) {
-        synchronized (supportedRealtimeFileExt) {
-            supportedRealtimeFileExt.clear();
-            supportedRealtimeFileExt.addAll(supportedRealtimeFileExtList);
+    public static void initSupportedHTMLFileExt(List supportedRealtimeFileExtList) {
+        synchronized (supportedHTMLFileExt) {
+            supportedHTMLFileExt.clear();
+            supportedHTMLFileExt.addAll(supportedRealtimeFileExtList);
        }
    }
        
-    public static boolean realtimeParsableMimeTypesContains(String mimeType) {
-        mimeType = getRealMimeType(mimeType);
-        synchronized (realtimeParsableMimeTypes) {
-            return realtimeParsableMimeTypes.contains(mimeType);
+    public static boolean HTMLParsableMimeTypesContains(String mimeType) {
+        mimeType = normalizeMimeType(mimeType);
+        synchronized (supportedHTMLMimeTypes) {
+            return supportedHTMLMimeTypes.contains(mimeType);
        }
    }
    
-    public static boolean supportedRealTimeContent(yacyURL url, String mimeType) {
-        return realtimeParsableMimeTypesContains(mimeType) && supportedRealtimeFileExtContains(url);
+    public static boolean supportedHTMLContent(yacyURL url, String mimeType) {
+        return HTMLParsableMimeTypesContains(mimeType) && supportedHTMLFileExtContains(url);
    }    
    
-    public static boolean supportedRealtimeFileExtContains(yacyURL url) {
+    public static boolean supportedHTMLFileExtContains(yacyURL url) {
        String fileExt = getFileExt(url);
-        synchronized (supportedRealtimeFileExt) {
-            return supportedRealtimeFileExt.contains(fileExt);
+        synchronized (supportedHTMLFileExt) {
+            return supportedHTMLFileExt.contains(fileExt);
        }   
    }

-    
    public static String getFileExt(yacyURL url) {
        // getting the file path
        String name = url.getPath();
@ -319,13 +288,12 @@ public final class plasmaParser {
        return name.substring(p + 1);        
    }

-    
    public static boolean mediaExtContains(String mediaExt) {
        if (mediaExt == null) return false;
        mediaExt = mediaExt.trim().toLowerCase();
        
-        synchronized (supportedRealtimeFileExt) {
-            if (supportedRealtimeFileExt.contains(mediaExt)) return false;
+        synchronized (supportedHTMLFileExt) {
+            if (supportedHTMLFileExt.contains(mediaExt)) return false;
        }        
        
        if (supportedFileExtContains(mediaExt)) return false;
@ -407,7 +375,7 @@ public final class plasmaParser {
    	return encoding;
    }
    
-    public static String getRealMimeType(String mimeType) {
+    public static String normalizeMimeType(String mimeType) {
        //if (mimeType == null) doMimeTypeAnalysis
        if (mimeType == null) mimeType = "application/octet-stream";
        mimeType = mimeType.trim().toLowerCase();
@ -616,7 +584,7 @@ public final class plasmaParser {
                this.theLogger.logFine("Parsing '" + location + "' from stream");            
            
            // getting the mimetype of the document
-            mimeType = getRealMimeType(theMimeType);
+            mimeType = normalizeMimeType(theMimeType);
            
            // getting the file extension of the document
            String fileExt = getFileExt(location);
@ -646,7 +614,7 @@ public final class plasmaParser {
                theParser.setContentLength(contentLength);
                // parse the resource
                doc = theParser.parse(location, mimeType,documentCharset,sourceStream);
-            } else if (realtimeParsableMimeTypesContains(mimeType)) {
+            } else if (HTMLParsableMimeTypesContains(mimeType)) {
                doc = parseHtml(location, mimeType, documentCharset, sourceStream);
            } else {
                String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
@ -749,7 +717,7 @@ public final class plasmaParser {
     */
    private Parser getParser(String mimeType) {

-        mimeType = getRealMimeType(mimeType);        
+        mimeType = normalizeMimeType(mimeType);        
        try {
            
            // determining the proper parser class name for the mimeType
@ -782,17 +750,6 @@ public final class plasmaParser {
        
    }
    
-    /*
-    public static String urlNormalform(URL url) {
-        if (url == null) return null;
-        return urlNormalform(url.toString());
-    }
-    
-    public static String urlNormalform(String us) {
-        return htmlFilterContentScraper.urlNormalform(us);
-    }   
-    */
-    
    static Map allReflinks(Set links) {
        // links is either a Set of Strings (with urls) or htmlFilterImageEntries
        // we find all links that are part of a reference inside a url
@ -909,11 +866,8 @@ public final class plasmaParser {
            // creating a plasma parser
            plasmaParser theParser = new plasmaParser();
            
-            // configuring the realtime parsable mimeTypes
-            plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml");
-            
-            // configure all other supported mimeTypes
-            plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
+            // configuring the html parsable mimeTypes
+            plasmaParser.initHTMLParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml");

            // parsing the content
            plasmaParserDocument document = null;
@ -955,17 +909,6 @@ public final class plasmaParser {
        }
    }
    
-    private static void enableAllParsers(String parserMode) {
-        if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException();
-        
-        plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode);
-        if (config == null) {
-            config = new plasmaParserConfig(parserMode);
-            parserConfigList.put(parserMode, config);
-        }
-        config.enableAllParsers();        
-    }
-    
    public static boolean supportedContent(yacyURL url, String mimeType) {
        if (url == null) throw new NullPointerException();
        
@ -984,6 +927,7 @@ public final class plasmaParser {
        if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException();
        if (url == null) throw new NullPointerException();
        
+        if (parserMode.equals(PARSER_MODE_IMAGE)) return true;
        plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode);
        return (config == null)?false:config.supportedContent(url, mimeType);
    }
--- a/source/de/anomic/plasma/plasmaParserConfig.java
+++ b/source/de/anomic/plasma/plasmaParserConfig.java
@ -88,7 +88,7 @@ public class plasmaParserConfig {
    public boolean supportedContent(yacyURL url, String mimeType) {
        // TODO: we need some exceptions here to index URLs like this
        //       http://www.musicabona.com/respighi/12668/cd/index.html.fr
-        mimeType = plasmaParser.getRealMimeType(mimeType);
+        mimeType = plasmaParser.normalizeMimeType(mimeType);
        if (
                mimeType.equals("text/html") ||
                mimeType.equals("application/xhtml+xml") ||
@ -100,10 +100,10 @@ public class plasmaParserConfig {
    }        
    
    public boolean supportedMimeTypesContains(String mimeType) {
-        mimeType = plasmaParser.getRealMimeType(mimeType);
+        mimeType = plasmaParser.normalizeMimeType(mimeType);
        
-        synchronized (plasmaParser.realtimeParsableMimeTypes) {
-            if (plasmaParser.realtimeParsableMimeTypes.contains(mimeType)) return true;
+        synchronized (plasmaParser.supportedHTMLMimeTypes) {
+            if (plasmaParser.supportedHTMLMimeTypes.contains(mimeType)) return true;
        }        

        synchronized (this.enabledParserList) { 
@ -124,8 +124,8 @@ public class plasmaParserConfig {
        if (fileExt == null) return false;        
        fileExt = fileExt.trim().toLowerCase();

-        synchronized (plasmaParser.supportedRealtimeFileExt) {
-            if (plasmaParser.supportedRealtimeFileExt.contains(fileExt)) return true;
+        synchronized (plasmaParser.supportedHTMLFileExt) {
+            if (plasmaParser.supportedHTMLFileExt.contains(fileExt)) return true;
        }        

        synchronized(this.supportedFileExt) {
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -563,11 +563,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
    // Parser settings
    //////////////////////////////////////////////////////////////////////////////////////////////
    
-    public static final String PARSER_MIMETYPES_REALTIME        = "parseableRealtimeMimeTypes";
+    public static final String PARSER_MIMETYPES_HTML            = "parseableMimeTypes.HTML";
    public static final String PARSER_MIMETYPES_PROXY           = "parseableMimeTypes.PROXY";
    public static final String PARSER_MIMETYPES_CRAWLER         = "parseableMimeTypes.CRAWLER";
    public static final String PARSER_MIMETYPES_ICAP            = "parseableMimeTypes.ICAP";
    public static final String PARSER_MIMETYPES_URLREDIRECTOR   = "parseableMimeTypes.URLREDIRECTOR";
+    public static final String PARSER_MIMETYPES_IMAGE           = "parseableMimeTypes.IMAGE";
    public static final String PARSER_MEDIA_EXT                 = "mediaExt";
    public static final String PARSER_MEDIA_EXT_PARSEABLE       = "parseableExt";
    
@ -1180,15 +1181,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        // define an extension-blacklist
        log.logConfig("Parser: Initializing Extension Mappings for Media/Parser");
        plasmaParser.initMediaExt(plasmaParser.extString2extList(getConfig(PARSER_MEDIA_EXT,"")));
-        plasmaParser.initSupportedRealtimeFileExt(plasmaParser.extString2extList(getConfig(PARSER_MEDIA_EXT_PARSEABLE,"")));
+        plasmaParser.initSupportedHTMLFileExt(plasmaParser.extString2extList(getConfig(PARSER_MEDIA_EXT_PARSEABLE,"")));
        
        // define a realtime parsable mimetype list
        log.logConfig("Parser: Initializing Mime Types");
-        plasmaParser.initRealtimeParsableMimeTypes(getConfig(PARSER_MIMETYPES_REALTIME,"application/xhtml+xml,text/html,text/plain"));
-        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_PROXY,getConfig(PARSER_MIMETYPES_PROXY,null));
-        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER,getConfig(PARSER_MIMETYPES_CRAWLER,null));
-        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_ICAP,getConfig(PARSER_MIMETYPES_ICAP,null));
-        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_URLREDIRECTOR,getConfig(PARSER_MIMETYPES_URLREDIRECTOR,null));
+        plasmaParser.initHTMLParsableMimeTypes(getConfig(PARSER_MIMETYPES_HTML, "application/xhtml+xml,text/html,text/plain"));
+        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_PROXY, getConfig(PARSER_MIMETYPES_PROXY, null));
+        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, getConfig(PARSER_MIMETYPES_CRAWLER, null));
+        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_ICAP, getConfig(PARSER_MIMETYPES_ICAP, null));
+        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_URLREDIRECTOR, getConfig(PARSER_MIMETYPES_URLREDIRECTOR, null));
+        plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_IMAGE, getConfig(PARSER_MIMETYPES_IMAGE, null));
        
        // start a loader
        log.logConfig("Starting Crawl Loader");
--- a/yacy.init
+++ b/yacy.init
@ -224,9 +224,9 @@ proxyCacheMigration = true

 # the following mime-types are the whitelist for indexing
 #
-# parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly
+# parseableMimeTypes.HTML: specifies mime-types that can be indexed with the built-in html parser
 # parseableMime: specifies mime-types that can be indexed but not on the fly
-parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain,text/sgml
+parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml
 parseableMimeTypes=
 parseableMimeTypes__pro=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-vnd.oasis.opendocument.text,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml
 parseableMimeTypes.CRAWLER=
@ -237,6 +237,7 @@ parseableMimeTypes.ICAP=
 parseableMimeTypes.ICAP__pro=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-vnd.oasis.opendocument.text,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml
 parseableMimeTypes.URLREDIRECTOR=
 parseableMimeTypes.URLREDIRECTOR__pro=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-vnd.oasis.opendocument.text,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml
+parseableMimeTypes.IMAGE=image/gif,image/jpeg,image/png,image/tiff,image/vnd.wap.wbmp,image/x-icon,image/bmp

 # media extension string
 # a comma-separated list of extensions that denote media file formats