From e7d055b98e20292d79cd59c733719ed943bc97df Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 13 Apr 2005 15:52:00 +0000 Subject: [PATCH] very experimental integration of the new generic parser and optional disabling of bluelist filtering in proxy. Does not yet work properly. To disable the disable-feature, the presence of a non-empty bluelist is necessary git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@17 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 7 +- htroot/IndexCreate_p.java | 2 +- .../htmlFilter/htmlFilterContentScraper.java | 188 +----------- .../htmlFilterContentTransformer.java | 6 + .../htmlFilter/htmlFilterTransformer.java | 5 + source/de/anomic/http/httpdProxyHandler.java | 111 ++++--- .../de/anomic/plasma/plasmaCrawlLoader.java | 21 +- source/de/anomic/plasma/plasmaHTCache.java | 281 +++++++++++------- source/de/anomic/plasma/plasmaParser.java | 20 +- .../de/anomic/plasma/plasmaSwitchboard.java | 106 ++++--- .../plasma/plasmaWordIndexFileCache.java | 42 ++- source/yacy.java | 2 +- yacy.blue | 1 + yacy.init | 12 +- 14 files changed, 404 insertions(+), 400 deletions(-) create mode 100644 yacy.blue diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 6f23c5b77..702244c6e 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -114,11 +114,12 @@ public class CacheAdmin_p { else { htmlFilterContentScraper scraper = new htmlFilterContentScraper(url); OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); + plasmaParser.document document = switchboard.parser.transformScraper(url, "text/html", scraper); serverFileUtils.copy(file, os); info += "HEADLINE:
" + scraper.getHeadline() + "
<br><br>";
-            info += "HREF:<br>" + formatAnchor(scraper.getHyperlinks()) + "<br>";
-            info += "MEDIA:<br>" + formatAnchor(scraper.getMedialinks()) + "<br>";
-            info += "EMAIL:<br>" + formatAnchor(scraper.getEmaillinks()) + "<br>";
+            info += "HREF:<br>" + formatAnchor(document.getHyperlinks()) + "<br>";
+            info += "MEDIA:<br>" + formatAnchor(document.getMedialinks()) + "<br>";
+            info += "EMAIL:<br>" + formatAnchor(document.getEmaillinks()) + "<br>";
             info += "TEXT:<br>" + new String(scraper.getText()) + "<br>
"; } } catch (Exception e) { diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index ef147f490..e2be76047 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -305,7 +305,7 @@ public class IndexCreate_p { prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth); prop.put("indexing-queue_list_"+i+"_modified", daydate(pcentry.lastModified)); - prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getHyperlinks().size()))); + prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getAnchors().size()))); prop.put("indexing-queue_list_"+i+"_anchor", ((pcentry.scraper == null) ? "-" : pcentry.scraper.getHeadline()) ); prop.put("indexing-queue_list_"+i+"_url", pcentry.urlString); dark = !dark; diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 48ca7c8d7..c3b7343e0 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -52,10 +52,6 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen private static HashSet linkTags0; private static HashSet linkTags1; - public static String mediaExt = - "swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," + - "sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj"; - static { linkTags0 = new HashSet(); linkTags0.add("img"); @@ -67,8 +63,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } // class variables: collectors for links - private Properties anchor; - private Properties image; + private HashMap anchors; + private HashMap images; private String title; private String headline; private serverByteBuffer text; @@ -79,8 +75,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // it is only the reference for relative links super(linkTags0, linkTags1); this.root = root; - this.anchor = new Properties(); - this.image = new Properties(); + this.anchors = new HashMap(); + this.images = new HashMap(); this.title = ""; this.headline = ""; this.text = new serverByteBuffer(); @@ -117,12 +113,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } public void scrapeTag0(String tagname, Properties tagopts) { - if (tagname.equals("img")) image.setProperty(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt","")); + if (tagname.equals("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt","")); } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { //System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); - if (tagname.equals("a")) anchor.setProperty(absolutePath(tagopts.getProperty("href", "")), + if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString()); if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); @@ -153,179 +149,19 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return 
text.getBytes(); } - public Properties getAnchor() { - return anchor; - } - - public Properties getImage() { - return image; - } - - public Map getHyperlinks() { - if (hyperlinks == null) resortLinks(); - return hyperlinks; - } - - public Map getMedialinks() { - if (medialinks == null) resortLinks(); - return medialinks; - } - - public Map getEmaillinks() { - if (emaillinks == null) resortLinks(); - return emaillinks; - } - - HashMap hyperlinks = null; - HashMap medialinks = null; - HashMap emaillinks = null; - - private synchronized void resortLinks() { - Iterator i; - String url; - int extpos; - String ext; - i = anchor.entrySet().iterator(); - hyperlinks = new HashMap(); - medialinks = new HashMap(); - emaillinks = new HashMap(); - Map.Entry entry; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - url = (String) entry.getKey(); - if ((url != null) && (url.startsWith("mailto:"))) { - emaillinks.put(url.substring(7), entry.getValue()); - } else { - extpos = url.lastIndexOf("."); - String normal; - if (extpos > 0) { - ext = url.substring(extpos).toLowerCase(); - normal = urlNormalform(url); - if (normal != null) { - if (mediaExt.indexOf(ext.substring(1)) >= 0) { - // this is not an normal anchor, its a media link - medialinks.put(normal, entry.getValue()); - } else { - hyperlinks.put(normal, entry.getValue()); - } - } - } - } - } - // finally add the images to the medialinks - i = image.entrySet().iterator(); - String normal; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - url = (String) entry.getKey(); - normal = urlNormalform(url); - if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException - } - expandHyperlinks(); - } - - /* - private synchronized void resortLinks() { - Enumeration e; - String url; - int extpos; - String ext; - e = anchor.propertyNames(); - hyperlinks = new Properties(); - medialinks = new Properties(); - emaillinks = new Properties(); - while (e.hasMoreElements()) { - url = (String) e.nextElement(); - if ((url != null) && (url.startsWith("mailto:"))) { - emaillinks.setProperty(url.substring(7), anchor.getProperty(url)); - } else { - extpos = url.lastIndexOf("."); - String normal; - if (extpos > 0) { - ext = url.substring(extpos).toLowerCase(); - normal = urlNormalform(url); - if (normal != null) { - if (mediaExt.indexOf(ext.substring(1)) >= 0) { - // this is not an normal anchor, its a media link - medialinks.setProperty(normal, anchor.getProperty(url)); - } else { - hyperlinks.setProperty(normal, anchor.getProperty(url)); - } - } - } - } - } - // finally add the images to the medialinks - e = image.propertyNames(); - String normal; - while (e.hasMoreElements()) { - url = (String) e.nextElement(); - normal = urlNormalform(url); - if (normal != null) medialinks.setProperty(normal, image.getProperty(url)); // avoid NullPointerException - } + public Map getAnchors() { + return anchors; } -*/ - public synchronized void expandHyperlinks() { - // we add artificial hyperlinks to the hyperlink set that can be calculated from - // given hyperlinks and imagelinks - hyperlinks.putAll(allReflinks(hyperlinks)); - hyperlinks.putAll(allReflinks(medialinks)); - hyperlinks.putAll(allSubpaths(hyperlinks)); - hyperlinks.putAll(allSubpaths(medialinks)); + public Map getImages() { + return images; } - private static Map allReflinks(Map links) { - // we find all links that are part of a reference inside a url - HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; - int pos; - loop: while (i.hasNext()) { - s = 
(String) i.next(); - if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) { - i.remove(); - s = s.substring(pos); - while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); - continue loop; - } - if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) { - i.remove(); - s = "http:/" + s.substring(pos); - while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); - continue loop; - } - } - return v; - } - - private static Map allSubpaths(Map links) { - HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; - int pos; - while (i.hasNext()) { - s = (String) i.next(); - if (s.endsWith("/")) s = s.substring(0, s.length() - 1); - pos = s.lastIndexOf("/"); - while (pos > 8) { - s = s.substring(0, pos + 1); - if (!(v.containsKey(s))) v.put(s, "sub"); - s = s.substring(0, pos); - pos = s.lastIndexOf("/"); - } - } - return v; - } - - public void print() { System.out.println("TITLE :" + title); System.out.println("HEADLINE:" + headline); - System.out.println("ANCHORS :" + anchor.toString()); - System.out.println("IMAGES :" + image.toString()); + System.out.println("ANCHORS :" + anchors.toString()); + System.out.println("IMAGES :" + images.toString()); System.out.println("TEXT :" + new String(text.getBytes())); } diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index 6d187d877..4e2baa5ee 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -65,6 +65,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer } public void init(String initarg) { + System.out.println("Transformer init: " + initarg); if (bluelist == null) { // here, the initarg is used to load a list of bluelisted words bluelist = new Vector(); @@ -78,9 +79,14 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer r.close(); } catch (Exception e) { } + if (bluelist.size() == 0) System.out.println("BLUELIST is empty"); } } + public boolean isIdentityTransformer() { + return bluelist.size() == 0; + } + private static byte[] genBlueLetters(int length) { serverByteBuffer bb = new serverByteBuffer(" ".getBytes()); length = length / 2; diff --git a/source/de/anomic/htmlFilter/htmlFilterTransformer.java b/source/de/anomic/htmlFilter/htmlFilterTransformer.java index 816cf8138..bc68336ed 100644 --- a/source/de/anomic/htmlFilter/htmlFilterTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterTransformer.java @@ -49,6 +49,11 @@ public interface htmlFilterTransformer { // more specific transformation rules public void init(String initarg); + // ask if this transformer will do any transformation whatsoever + // this may return true if the initialization resultet in a status + // that does not allow any transformation + public boolean isIdentityTransformer(); + // tests, if a given body-less tag (i.e.
shall be supervised) // only tags that are defined here will be cached and not streamed public boolean isTag0(String tag); diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 7cbd696ee..6efb0ec6d 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -125,15 +125,9 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand if (!(htRootPath.exists())) htRootPath.mkdir(); } - // load a transformer - try { - ClassLoader cp = new serverClassLoader(this.getClass().getClassLoader()); - Class transformerClass = cp.loadClass(switchboard.getConfig("pageTransformerClass", "")); - transformer = (htmlFilterTransformer) transformerClass.newInstance(); - transformer.init(switchboard.getConfig("pageTransformerArg", "")); // this is usually the blueList - } catch (Exception e) { - transformer = null; - } + // load a transformer + transformer = new htmlFilterContentTransformer(); + transformer.init(new File(switchboard.getRootPath(), switchboard.getConfig("plasmaBlueList", "")).toString()); String f; // load the yellow-list @@ -396,7 +390,7 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand if (cacheExists) { // we respond on the request by using the cache - hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, null, switchboard.defaultProxyProfile); + hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, switchboard.defaultProxyProfile); if (hpc.shallUseCache()) { // the cache is fresh @@ -426,7 +420,8 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative' // make a transformer - if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && + if ((!(transformer.isIdentityTransformer())) && + ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && ((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) { hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0)); } else { @@ -472,24 +467,30 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand res = remote.GET(remotePath, requestHeader); long contentLength = res.responseHeader.contentLength(); - // make a scraper and transformer + // reserver cache entry + hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile); + + // make a scraper and transformer if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) { - scraper = new htmlFilterContentScraper(url); - hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); - if (((htmlFilterOutputStream) hfos).binarySuspect()) { - scraper = null; // forget it, may be rubbish - log.logDebug("Content of " + url + " is probably binary. 
deleted scraper."); + if (transformer.isIdentityTransformer()) { + hfos = hpc.getContentOutputStream(); + } else { + scraper = new htmlFilterContentScraper(url); + hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); + if (((htmlFilterOutputStream) hfos).binarySuspect()) { + scraper = null; // forget it, may be rubbish + log.logDebug("Content of " + url + " is probably binary. deleted scraper."); + } + hpc.scraper = scraper; } } else { log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped"); scraper = null; hfos = respond; + hpc.scraper = scraper; } - // reserver cache entry - hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, scraper, null, switchboard.defaultProxyProfile); - // handle incoming cookies handleIncomingCookies(res.responseHeader, host, ip); @@ -502,7 +503,13 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand if ((contentLength > 0) && // known (contentLength < 1048576)) // 1 MB { - byte[] cacheArray = res.writeContent(hfos); + byte[] cacheArray; + if (transformer.isIdentityTransformer()) { + res.writeContentX(hfos, respond); + cacheArray = hpc.getContentBytes(); + } else { + cacheArray = res.writeContent(hfos); + } if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); // before we came here we deleted a cache entry if (sizeBeforeDelete == cacheArray.length) { @@ -514,8 +521,16 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache } } else { + // the file is too big to cache it in the ram, write to file cacheFile.getParentFile().mkdirs(); - res.writeContent(hfos, cacheFile); + if (transformer.isIdentityTransformer()) { + res.writeContent(respond, cacheFile); + if (contentLength < 10485760) { // 10 mb + serverFileUtils.copy(cacheFile, hfos); + } // else hfos is empty and that means: no work afterwards with it + } else { + res.writeContent(hfos, cacheFile); + } if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); // before we came here we deleted a cache entry if (sizeBeforeDelete == cacheFile.length()) { @@ -579,24 +594,30 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand httpc.response res = remote.GET(remotePath, requestHeader); long contentLength = res.responseHeader.contentLength(); - // make a scraper and transformer + // reserve cache entry + hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile); + + // make a scraper and transformer if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) { - scraper = new htmlFilterContentScraper(url); - hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); - if (((htmlFilterOutputStream) hfos).binarySuspect()) { - scraper = null; // forget it, may be rubbish - log.logDebug("Content of " + url + " is probably binary. 
deleted scraper."); + if (transformer.isIdentityTransformer()) { + hfos = hpc.getContentOutputStream(); + } else { + scraper = new htmlFilterContentScraper(url); + hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); + if (((htmlFilterOutputStream) hfos).binarySuspect()) { + scraper = null; // forget it, may be rubbish + log.logDebug("Content of " + url + " is probably binary. deleted scraper."); + } + hpc.scraper = scraper; } - } else { + } else { log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped"); scraper = null; hfos = respond; + hpc.scraper = scraper; } - // reserve cache entry - hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, scraper, null, switchboard.defaultProxyProfile); - // handle incoming cookies handleIncomingCookies(res.responseHeader, host, ip); @@ -608,16 +629,29 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand if ((storeError = hpc.shallStoreCache()) == null) { // we write a new cache entry if ((contentLength > 0) && (contentLength < 1048576)) { - // write to buffer - byte[] cacheArray = res.writeContent(hfos); - if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); + // write to buffer + byte[] cacheArray; + if (transformer.isIdentityTransformer()) { + res.writeContentX(hfos, respond); + cacheArray = hpc.getContentBytes(); + } else { + cacheArray = res.writeContent(hfos); + } + if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); // enQueue new entry with response header and file as byte[] hpc.status = plasmaHTCache.CACHE_FILL; cacheManager.stackProcess(hpc, cacheArray); } else try { // write to file system directly cacheFile.getParentFile().mkdirs(); - res.writeContent(hfos, cacheFile); + if (transformer.isIdentityTransformer()) { + res.writeContent(respond, cacheFile); + if (contentLength < 10485760) { // 10 mb + serverFileUtils.copy(cacheFile, hfos); + } // else hfos is empty and that means: no work afterwards with it + } else { + res.writeContent(hfos, cacheFile); + } if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); // enQueue new entry with response header hpc.status = plasmaHTCache.CACHE_FILL; @@ -711,9 +745,6 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand } } - - - public void doHead(Properties conProp, httpHeader requestHeader, OutputStream respond) throws IOException { String method = conProp.getProperty("METHOD"); String host = conProp.getProperty("HOST"); @@ -834,8 +865,6 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand respond.flush(); } - - public void doConnect(Properties conProp, de.anomic.http.httpHeader requestHeader, InputStream clientIn, OutputStream clientOut) throws IOException { String host = conProp.getProperty("HOST"); int port = Integer.parseInt(conProp.getProperty("PORT")); diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index 52aa1f1ff..bc1821b22 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -88,7 +88,7 @@ public class plasmaCrawlLoader { // we kill that thread thread.interrupt(); // hopefully this wakes him up. 
slots.remove(i); - System.out.println("CRAWLER: IGNORING SLEEPING DOWNLOAD SLOT " + thread.url.toString()); + log.logDebug("IGNORING SLEEPING DOWNLOAD SLOT " + thread.url.toString()); } } else { // thread i is dead, remove it @@ -198,31 +198,26 @@ public class plasmaCrawlLoader { // the transfer is ok long contentLength = res.responseHeader.contentLength(); - // make a scraper and transformer - htmlFilterContentScraper scraper = new htmlFilterContentScraper(url); - OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); - // reserve cache entry - plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, scraper, initiator, profile); + plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile); // request has been placed and result has been returned. work off response File cacheFile = cacheManager.getCachePath(url); try { if (!(httpd.isTextMime(res.responseHeader.mime().toLowerCase(), acceptMimeTypes))) { // if the response has not the right file type then reject file - hfos.close(); remote.close(); - System.out.println("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString()); + log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString()); htCache.status = plasmaHTCache.CACHE_UNFILLED; } else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) { // we write the new cache entry to file system directly cacheFile.getParentFile().mkdirs(); - res.writeContent(hfos, cacheFile); // writes in content scraper and cache file + res.writeContent(htCache.getContentOutputStream(), cacheFile); // writes in content scraper and cache file htCache.status = plasmaHTCache.CACHE_FILL; } else { if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error); // anyway, the content still lives in the content scraper - res.writeContent(hfos, null); // writes only into content scraper + res.writeContent(htCache.getContentOutputStream(), null); // writes only into content scraper htCache.status = plasmaHTCache.CACHE_PASSING; } // enQueue new entry with response header @@ -240,18 +235,18 @@ public class plasmaCrawlLoader { // but we clean the cache also, since it may be only partial // and most possible corrupted if (cacheFile.exists()) cacheFile.delete(); - System.out.println("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString()); + log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString()); } } else { // if the response has not the right response type then reject file - System.out.println("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString()); + log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString()); // not processed any further } remote.close(); } catch (Exception e) { // this may happen if the targeted host does not exist or anything with the // remote server was wrong. 
- System.out.println("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString()); + log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString()); e.printStackTrace(); } } diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index f6c809696..f5d5b9ebe 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -427,11 +427,10 @@ public class plasmaHTCache { public Entry newEntry(Date initDate, int depth, URL url, httpHeader requestHeader, String responseStatus, httpHeader responseHeader, - htmlFilterContentScraper scraper, String initiator, plasmaCrawlProfile.entry profile) { //System.out.println("NEW ENTRY: " + url.toString()); // DEBUG - return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, scraper, initiator, profile); + return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, initiator, profile); } public class Entry { @@ -449,15 +448,17 @@ public class plasmaHTCache { public String urlString; public int status; // cache load/hit/stale etc status public Date lastModified; - public htmlFilterContentScraper scraper; public char doctype; public String language; public plasmaCrawlProfile.entry profile; private String initiator; + public ByteArrayOutputStream content; + public htmlFilterContentScraper scraper; + + public Entry(Date initDate, int depth, URL url, httpHeader requestHeader, String responseStatus, httpHeader responseHeader, - htmlFilterContentScraper scraper, String initiator, plasmaCrawlProfile.entry profile) { @@ -478,7 +479,7 @@ public class plasmaHTCache { this.requestHeader = requestHeader; this.responseStatus = responseStatus; this.responseHeader = responseHeader; - this.scraper = scraper; + this.content = new ByteArrayOutputStream(); this.profile = profile; this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator); @@ -503,8 +504,16 @@ public class plasmaHTCache { // to be defined later: this.cacheArray = null; this.status = CACHE_UNFILLED; + this.scraper = null; } + public OutputStream getContentOutputStream() { + return (OutputStream) content; + } + public byte[] getContentBytes() { + try { content.flush(); } catch (IOException e) {} + return content.toByteArray(); + } public String initiator() { return initiator; } @@ -614,8 +623,129 @@ public class plasmaHTCache { return null; } - - public String shallIndexCache() { + + public boolean shallUseCache() { + // decide upon header information if a specific file should be taken from the cache or not + + //System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString()); + + // -CGI access in request + // CGI access makes the page very individual, and therefore not usable in caches + if (isPOST(urlString)) return false; + if (isCGI(urlString)) return false; + + // -authorization cases in request + if (requestHeader.containsKey("AUTHORIZATION")) return false; + + // -ranges in request + // we do not cache partial content + if ((requestHeader != null) && (requestHeader.containsKey("RANGE"))) return false; + + //Date d1, d2; + + // -if-modified-since in request + // The entity has to be transferred only if it has + // been modified since the date given by the If-Modified-Since header. 
+ if (requestHeader.containsKey("IF-MODIFIED-SINCE")) { + // checking this makes only sense if the cached response contains + // a Last-Modified field. If the field does not exist, we go the safe way + if (!(responseHeader.containsKey("Last-Modified"))) return false; + // parse date + Date d1, d2; + d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date(); + d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date(); + // finally, we shall treat the cache as stale if the modification time is after the if-.. time + if (d2.after(d1)) return false; + } + + boolean isNotPicture = !isPicture(responseHeader); + + // -cookies in request + // unfortunately, we should reload in case of a cookie + // but we think that pictures can still be considered as fresh + if ((requestHeader.containsKey("COOKIE")) && (isNotPicture)) return false; + + // -set-cookie in cached response + // this is a similar case as for COOKIE. + if ((responseHeader.containsKey("SET-COOKIE")) && (isNotPicture)) return false; // too strong + if ((responseHeader.containsKey("SET-COOKIE2")) && (isNotPicture)) return false; // too strong + + // -pragma in cached response + // logically, we would not need to care about no-cache pragmas in cached response headers, + // because they cannot exist since they are not written to the cache. + // So this IF should always fail.. + if ((responseHeader.containsKey("PRAGMA")) && + (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return false; + + // calculate often needed values for freshness attributes + Date date = responseHeader.date(); + Date expires = responseHeader.expires(); + Date lastModified = responseHeader.lastModified(); + String cacheControl = (String) responseHeader.get("Cache-Control"); + + + // see for documentation also: + // http://www.web-caching.com/cacheability.html + // http://vancouver-webpages.com/CacheNow/ + + // look for freshnes information + // if we don't have any freshnes indication, we treat the file as stale. + // no handle for freshness control: + if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false; + + // -expires in cached response + // the expires value gives us a very easy hint when the cache is stale + if (expires != null) { + Date yesterday = new Date((new Date()).getTime() - oneday); + if (expires.before(yesterday)) return false; + } + + // -lastModified in cached response + // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read + // of the file and the last modified date as the age of the file. If we consider the file as + // middel-aged then, the maximum TTL would be cache-creation plus age. + // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache + // file may only be treated as fresh for one more month, not more. + if (lastModified != null) { + if (date == null) date = new Date(); + long age = date.getTime() - lastModified.getTime(); + if (age < 0) return false; + // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10 + // the actual living-time is new Date().getTime() - d2.getTime() + // therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10 + if ((new Date()).getTime() - date.getTime() > age / 10) return false; + } + + // -cache-control in cached response + // the cache-control has many value options. 
+ if (cacheControl != null) { + cacheControl = cacheControl.trim().toUpperCase(); + if (cacheControl.startsWith("PUBLIC")) { + // ok, do nothing + } else if ((cacheControl.startsWith("PRIVATE")) || + (cacheControl.startsWith("NO-CACHE")) || + (cacheControl.startsWith("NO-STORE"))) { + // easy case + return false; + } else if (cacheControl.startsWith("MAX-AGE=")) { + // we need also the load date + if (date == null) return false; + try { + long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live + if ((new Date()).getTime() - date.getTime() > ttl) { + return false; + } + } catch (Exception e) { + return false; + } + } + } + + return true; + } + + + public String shallIndexCacheForProxy() { // decide upon header information if a specific file should be indexed // this method returns null if the answer is 'YES'! // if the answer is 'NO' (do not index), it returns a string with the reason @@ -670,10 +800,8 @@ public class plasmaHTCache { // thus we do not care about it here for indexing // -pragma in cached response - /* if ((responseHeader.containsKey("PRAGMA")) && (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return "Denied_(pragma_no_cache)"; - */ // see for documentation also: // http://www.web-caching.com/cacheability.html @@ -732,126 +860,69 @@ public class plasmaHTCache { return null; } - - public boolean shallUseCache() { - // decide upon header information if a specific file should be taken from the cache or not - - //System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString()); + + + public String shallIndexCacheForCrawler() { + // decide upon header information if a specific file should be indexed + // this method returns null if the answer is 'YES'! + // if the answer is 'NO' (do not index), it returns a string with the reason + // to reject the crawling demand in clear text + // check profile + if (!(profile.localIndexing())) return "Indexing_Not_Allowed"; + // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches - if (isPOST(urlString)) return false; - if (isCGI(urlString)) return false; + if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)"; + if ((isCGI(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)"; // -authorization cases in request - if (requestHeader.containsKey("AUTHORIZATION")) return false; + // we checked that in shallStoreCache // -ranges in request - // we do not cache partial content - if ((requestHeader != null) && (requestHeader.containsKey("RANGE"))) return false; + // we checked that in shallStoreCache - //Date d1, d2; + // a picture cannot be indexed + if (isPicture(responseHeader)) return "Media_Content_(Picture)"; + if (!(isText(responseHeader))) return "Media_Content_(not_text)"; + if (noIndexingURL(urlString)) return "Media_Content_(forbidden)"; // -if-modified-since in request - // The entity has to be transferred only if it has - // been modified since the date given by the If-Modified-Since header. - if (requestHeader.containsKey("IF-MODIFIED-SINCE")) { - // checking this makes only sense if the cached response contains - // a Last-Modified field. 
If the field does not exist, we go the safe way - if (!(responseHeader.containsKey("Last-Modified"))) return false; - // parse date - Date d1, d2; - d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date(); - d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date(); - // finally, we shall treat the cache as stale if the modification time is after the if-.. time - if (d2.after(d1)) return false; - } + // if the page is fresh at the very moment we can index it + // -> this does not apply for the crawler - boolean isNotPicture = !isPicture(responseHeader); - // -cookies in request - // unfortunately, we should reload in case of a cookie - // but we think that pictures can still be considered as fresh - if ((requestHeader.containsKey("COOKIE")) && (isNotPicture)) return false; - - // -set-cookie in cached response - // this is a similar case as for COOKIE. - if ((responseHeader.containsKey("SET-COOKIE")) && (isNotPicture)) return false; // too strong - if ((responseHeader.containsKey("SET-COOKIE2")) && (isNotPicture)) return false; // too strong - - // -pragma in cached response - // logically, we would not need to care about no-cache pragmas in cached response headers, - // because they cannot exist since they are not written to the cache. - // So this IF should always fail.. - if ((responseHeader.containsKey("PRAGMA")) && - (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return false; - - // calculate often needed values for freshness attributes - Date date = responseHeader.date(); - Date expires = responseHeader.expires(); - Date lastModified = responseHeader.lastModified(); - String cacheControl = (String) responseHeader.get("Cache-Control"); - - - // see for documentation also: - // http://www.web-caching.com/cacheability.html - // http://vancouver-webpages.com/CacheNow/ + // unfortunately, we cannot index pages which have been requested with a cookie + // because the returned content may be special for the client + // -> this does not apply for a crawler + + // -set-cookie in response + // the set-cookie from the server does not indicate that the content is special + // thus we do not care about it here for indexing + // -> this does not apply for a crawler + // -pragma in cached response + // -> in the crawler we ignore this + // look for freshnes information - // if we don't have any freshnes indication, we treat the file as stale. - // no handle for freshness control: - if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false; // -expires in cached response // the expires value gives us a very easy hint when the cache is stale - if (expires != null) { - Date yesterday = new Date((new Date()).getTime() - oneday); - if (expires.before(yesterday)) return false; - } + // sometimes, the expires date is set to the past to prevent that a page is cached + // we use that information to see if we should index it + // -> this does not apply for a crawler // -lastModified in cached response - // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read - // of the file and the last modified date as the age of the file. If we consider the file as - // middel-aged then, the maximum TTL would be cache-creation plus age. - // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache - // file may only be treated as fresh for one more month, not more. 
- if (lastModified != null) { - if (date == null) date = new Date(); - long age = date.getTime() - lastModified.getTime(); - if (age < 0) return false; - // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10 - // the actual living-time is new Date().getTime() - d2.getTime() - // therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10 - if ((new Date()).getTime() - date.getTime() > age / 10) return false; - } + // this information is too weak to use it to prevent indexing + // even if we can apply a TTL heuristic for cache usage // -cache-control in cached response // the cache-control has many value options. - if (cacheControl != null) { - cacheControl = cacheControl.trim().toUpperCase(); - if (cacheControl.startsWith("PUBLIC")) { - // ok, do nothing - } else if ((cacheControl.startsWith("PRIVATE")) || - (cacheControl.startsWith("NO-CACHE")) || - (cacheControl.startsWith("NO-STORE"))) { - // easy case - return false; - } else if (cacheControl.startsWith("MAX-AGE=")) { - // we need also the load date - if (date == null) return false; - try { - long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live - if ((new Date()).getTime() - date.getTime() > ttl) { - return false; - } - } catch (Exception e) { - return false; - } - } - } + // -> in the crawler we ignore this - return true; + return null; } + } } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 747f9c7eb..aafb9e83f 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -59,17 +59,25 @@ public class plasmaParser { } - public document parse(URL location, String mimeType, byte[] source) { + public document parseSource(URL location, String mimeType, byte[] source) { // make a scraper and transformer htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); try { hfos.write(source); + return transformScraper(location, mimeType, scraper); + } catch (IOException e) { + return null; + } + } + + public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) { + try { return new document(new URL(urlNormalform(location)), - mimeType, null, null, scraper.getHeadline(), - null, null, - scraper.getText(), scraper.getAnchor(), scraper.getImage()); - } catch (Exception e) { + mimeType, null, null, scraper.getHeadline(), + null, null, + scraper.getText(), scraper.getAnchors(), scraper.getImages()); + } catch (MalformedURLException e) { return null; } } @@ -89,8 +97,6 @@ public class plasmaParser { return us; } - - public class document { URL location; // the source url diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 633f4610e..68aa6c746 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -147,6 +147,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi public HashSet extensionBlack; public HashMap outgoingCookies, incomingCookies; public kelondroTables facilityDB; + public plasmaParser parser; public int serverJobs; public boolean terminate = false; @@ -203,28 +204,10 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi // make crawl profiles database and default profiles profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db")); - - 
//System.out.println("profiles.size=" + profiles.size()); - //System.out.println("profile-config=" + getConfig("defaultProxyProfile", "").length()); - //System.out.println("profile-entry=" + profiles.getEntry(getConfig("defaultProxyProfile", "")).toString()); - if ((profiles.size() == 0) || - (getConfig("defaultProxyProfile", "").length() == 0) || - (profiles.getEntry(getConfig("defaultProxyProfile", "")) == null)) { - // generate new default entry for proxy crawling - defaultProxyProfile = profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, false, true, true, true); - setConfig("defaultProxyProfile", defaultProxyProfile.handle()); - } else { - defaultProxyProfile = profiles.getEntry(getConfig("defaultProxyProfile", "")); - } - if ((profiles.size() == 1) || - (getConfig("defaultRemoteProfile", "").length() == 0) || - (profiles.getEntry(getConfig("defaultRemoteProfile", "")) == null)) { - // generate new default entry for proxy crawling - defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, false, false, true, true, false, true, true, false); - setConfig("defaultRemoteProfile", defaultRemoteProfile.handle()); - } else { - defaultRemoteProfile = profiles.getEntry(getConfig("defaultRemoteProfile", "")); - } + initProfiles(); + + // make parser + parser = new plasmaParser(new File("")); // start indexing management loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL); @@ -309,14 +292,46 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi this.serverJobs = jobs; } + private void initProfiles() throws IOException { + if ((profiles.size() == 0) || + (getConfig("defaultProxyProfile", "").length() == 0) || + (profiles.getEntry(getConfig("defaultProxyProfile", "")) == null)) { + // generate new default entry for proxy crawling + defaultProxyProfile = profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, false, true, true, true); + setConfig("defaultProxyProfile", defaultProxyProfile.handle()); + } else { + defaultProxyProfile = profiles.getEntry(getConfig("defaultProxyProfile", "")); + } + if ((profiles.size() == 1) || + (getConfig("defaultRemoteProfile", "").length() == 0) || + (profiles.getEntry(getConfig("defaultRemoteProfile", "")) == null)) { + // generate new default entry for proxy crawling + defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, false, false, true, true, false, true, true, false); + setConfig("defaultRemoteProfile", defaultRemoteProfile.handle()); + } else { + defaultRemoteProfile = profiles.getEntry(getConfig("defaultRemoteProfile", "")); + } + } + private void resetProfiles() { + File pdb = new File(plasmaPath, "crawlProfiles0.db"); + if (pdb.exists()) pdb.delete(); + try { + profiles = new plasmaCrawlProfile(pdb); + initProfiles(); + } catch (IOException e) {} + } private void cleanProfiles() { if (totalSize() > 0) return; Iterator i = profiles.profiles(true); plasmaCrawlProfile.entry entry; - while (i.hasNext()) { - entry = (plasmaCrawlProfile.entry) i.next(); - if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) i.remove(); - } + try { + while (i.hasNext()) { + entry = (plasmaCrawlProfile.entry) i.next(); + if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) i.remove(); + } + } catch 
(kelondroException e) { + resetProfiles(); + } } public plasmaHTCache getCacheManager() { @@ -454,7 +469,8 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi private synchronized void processResourceStack(plasmaHTCache.Entry entry) { // work off one stack entry with a fresh resource (scraped web page) - if (entry.scraper != null) try { + byte[] content; + if (((content = entry.getContentBytes()).length > 0) || (entry.scraper != null)) try { // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) // 2) result of search queries, some indexes are here (not possible here) @@ -479,10 +495,20 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi log.logDebug("processResourceStack: processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", url=" + entry.url); // DEBUG + // parse content + plasmaParser.document document; + if (entry.scraper != null) { + log.logDebug("(Parser) '" + entry.urlString + "' is pre-parsed by scraper"); + document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper); + } else { + log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now"); + document = parser.parseSource(entry.url, entry.responseHeader.mime(), content); + } + // put anchors on crawl stack if (((processCase == 4) || (processCase == 5)) && (entry.depth < entry.profile.generalDepth())) { - Map hl = entry.scraper.getHyperlinks(); + Map hl = document.getHyperlinks(); Iterator i = hl.entrySet().iterator(); String nexturlstring; String rejectReason; @@ -500,18 +526,26 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi } } log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() + - ", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize()); + ", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize()); } // create index - String noIndexReason; - String descr = entry.scraper.getHeadline(); + + String descr = document.getMainLongTitle(); URL referrerURL = entry.referrerURL(); String referrerHash = (referrerURL == null) ? 
plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL); - if ((noIndexReason = entry.shallIndexCache()) == null ) { + String noIndexReason = "unspecified"; + if (processCase == 4) { + // proxy-load + noIndexReason = entry.shallIndexCacheForProxy(); + } else { + // normal crawling + noIndexReason = entry.shallIndexCacheForCrawler(); + } + if (noIndexReason == null) { // strip out words log.logDebug("(Profile) Condensing for '" + entry.urlString + "'"); - plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(entry.scraper.getText())); + plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText())); //log.logInfo("INDEXING HEADLINE:" + descr); try { @@ -573,7 +607,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi } // explicit delete/free resources - entry.scraper = null; entry = null; + document = null; entry = null; } catch (IOException e) { log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString()); } @@ -1310,6 +1344,10 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage()); e.printStackTrace(); return new plasmaWordIndexEntity[0]; + } catch (kelondroException e) { + log.logError("selectTransferIndexes database corrupted: " + e.getMessage()); + e.printStackTrace(); + return new plasmaWordIndexEntity[0]; } } diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCache.java b/source/de/anomic/plasma/plasmaWordIndexFileCache.java index 517686efb..c714ce6f7 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFileCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexFileCache.java @@ -62,6 +62,7 @@ package de.anomic.plasma; import java.io.*; import java.util.*; +import de.anomic.server.*; import de.anomic.kelondro.*; public class plasmaWordIndexFileCache { @@ -72,24 +73,43 @@ public class plasmaWordIndexFileCache { // class variables private File databaseRoot; private kelondroTree indexCache; + private int bufferkb; public plasmaWordIndexFileCache(File databaseRoot, int bufferkb) throws IOException { this.databaseRoot = databaseRoot; + this.bufferkb = bufferkb; File indexCacheFile = new File(databaseRoot, indexCacheFileName); if (indexCacheFile.exists()) { // simply open the file indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400); } else { - // create a new file - int[] columns = new int[buffers + 2]; - columns[0] = plasmaWordIndexEntry.wordHashLength; - columns[1] = 1; - for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort; - indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns); + createCacheFile(indexCacheFile); } } - + private void resetCacheFile() { + // this has to be used in emergencies only + // it can happen that there is a serious db inconsistency; in that case we re-create the indexCache + try { indexCache.close(); } catch (IOException e) {} + File indexCacheFile = new File(databaseRoot, indexCacheFileName); + if (indexCacheFile.exists()) indexCacheFile.delete(); + try { + createCacheFile(indexCacheFile); + } catch (IOException e) { + de.anomic.server.serverLog.logError("PLASMA", "plasmaWordIndexFileCache.resetCacheFile: serious failure creating the cache file: " + e.getMessage()); + indexCache = null; + } + } + + private void createCacheFile(File indexCacheFile) throws IOException { + // create a new file + int[] columns = new int[buffers + 2]; + columns[0] = 
plasmaWordIndexEntry.wordHashLength; + columns[1] = 1; + for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort; + indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns); + } + protected void close() throws IOException { indexCache.close(); indexCache = null; @@ -162,8 +182,12 @@ public class plasmaWordIndexFileCache { indexCache.put(row); } catch (kelondroException e) { // this is a very bad case; a database inconsistency occurred - deleteComplete(wordHash); - System.out.println("fatal error in plasmaWordIndexFileCacle.addEntriesToIndex: write to word hash file " + wordHash + " failed - " + e.getMessage() + " - index deleted."); + serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted"); + resetCacheFile(); + } catch (IOException e) { + // this is a very bad case; a database inconsistency occurred + serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted"); + resetCacheFile(); } } // finished! diff --git a/source/yacy.java b/source/yacy.java index 51e2d4c15..eca12829c 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -208,7 +208,7 @@ public class yacy { } // init parser - de.anomic.htmlFilter.htmlFilterContentScraper.mediaExt = sb.getConfig("mediaExt",""); + de.anomic.plasma.plasmaParser.mediaExt = sb.getConfig("mediaExt",""); // start main threads try { diff --git a/yacy.blue b/yacy.blue new file mode 100644 index 000000000..d2833eb81 --- /dev/null +++ b/yacy.blue @@ -0,0 +1 @@ +testblue diff --git a/yacy.init b/yacy.init index f0f62fe49..97b01a63b 100644 --- a/yacy.init +++ b/yacy.init @@ -166,16 +166,8 @@ remoteProxyUse=false #remoteProxyUse=true # the proxy may filter the content of transferred web pages -# this is archieved using a special filtering class that can be -# exchanged like a transformation plug-in -# If you want to do this, you must implement the htmlFilterTransformer -# -Interface and set the name of the implementing class here. -# As a default, we use a filtering Transformer that takes a blacklist -# and blocks all text fragments where a word from the blacklist appears -# as the blacklist, we use the search-engine's blue-list -# please see that class as an implementation example for your own transformers -pageTransformerClass=htmlFilterContentTransformer -pageTransformerArg=yacy.blue +# the bluelist removes specific keywords from web pages +proxyBlueList=yacy.blue # security settigns # we provide proxy and server security through a 2-stage security gate:
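
Illustration (not part of the patch): the core of this change is the new isIdentityTransformer() gate in httpdProxyHandler — the proxy only routes responses through the bluelist filter when the configured bluelist file (yacy.blue) actually contains words; an empty bluelist makes the transformer an identity transformer, and the response body is then streamed straight into the cache entry. The following minimal, self-contained sketch mirrors that decision; apart from isIdentityTransformer(), the class and method names are simplified stand-ins, not the actual YaCy classes.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;

public class BluelistGateSketch {

    // Simplified stand-in for htmlFilterContentTransformer: it loads the bluelist
    // and reports whether it would transform anything at all.
    static class BluelistTransformer {
        private final List<String> bluelist = new ArrayList<>();

        // mirrors init(initarg): one bluelisted word per line, e.g. read from yacy.blue
        void init(File bluelistFile) {
            try (BufferedReader r = new BufferedReader(new FileReader(bluelistFile))) {
                String line;
                while ((line = r.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) bluelist.add(line.toLowerCase());
                }
            } catch (Exception e) {
                // a missing or unreadable bluelist simply leaves the list empty
            }
        }

        // mirrors the new isIdentityTransformer(): an empty bluelist means the
        // transformer would never change any content, so filtering can be bypassed
        boolean isIdentityTransformer() {
            return bluelist.isEmpty();
        }
    }

    public static void main(String[] args) {
        BluelistTransformer transformer = new BluelistTransformer();
        transformer.init(new File(args.length > 0 ? args[0] : "yacy.blue"));

        if (transformer.isIdentityTransformer()) {
            // patch behaviour: skip the htmlFilterOutputStream and write the response
            // body straight into the cache entry's content output stream
            System.out.println("bluelist empty -> proxy streams the response unmodified");
        } else {
            // non-empty bluelist: wrap the response in the filtering output stream
            System.out.println("bluelist present -> proxy applies bluelist word filtering");
        }
    }
}

This is also what the commit message means by the "disable-feature": bluelist filtering in the proxy is now skipped by default and is only activated again by providing a non-empty bluelist.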