From 87a8747ce338a146d295b5af8e15bd91fa50ca47 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Mon, 25 Feb 2008 14:08:15 +0000
Subject: [PATCH] - enhanced recognition, parsing, management and
 double-occurrence-handling of image tags - enhanced text parser (condenser):
 found and eliminated bad code parts; increase of speed - added handling of
 image preview using the image cache from HTCACHE - some other minor changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4507 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/CacheAdmin_p.java                      |  5 +-
 htroot/ViewFile.java                          |  6 +--
 htroot/ViewImage.java                         | 46 +++++++++----------
 htroot/yacysearchitem.html                    |  2 +-
 htroot/yacysearchitem.java                    |  2 +
 .../htmlFilter/htmlFilterContentScraper.java  | 46 ++++++++++++++++---
 source/de/anomic/http/httpdFileHandler.java   |  1 +
 .../anomic/plasma/parser/rss/rssParser.java   | 10 ++--
 .../anomic/plasma/parser/tar/tarParser.java   |  6 +--
 .../anomic/plasma/parser/zip/zipParser.java   |  6 +--
 source/de/anomic/plasma/plasmaCondenser.java  | 41 ++++++++++-------
 .../de/anomic/plasma/plasmaCrawlBalancer.java |  5 +-
 source/de/anomic/plasma/plasmaCrawlNURL.java  | 30 +++++++++---
 source/de/anomic/plasma/plasmaParser.java     |  5 +-
 .../anomic/plasma/plasmaParserDocument.java   | 32 ++++++-------
 .../de/anomic/plasma/plasmaSearchImages.java  | 26 +++--------
 .../de/anomic/plasma/plasmaSnippetCache.java  |  3 +-
 17 files changed, 161 insertions(+), 111 deletions(-)

diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index d825be9b4..bc7898de9 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -54,6 +54,7 @@ import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.Writer;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeSet;
@@ -313,8 +314,8 @@ public class CacheAdmin_p {
         prop.put("info_type_use." + extension, (i == 0) ? 0 : 1);
     }
 
-    private static void formatImageAnchor(serverObjects prop, TreeSet<htmlFilterImageEntry> anchor) {
-        final Iterator<htmlFilterImageEntry> iter = anchor.iterator();
+    private static void formatImageAnchor(serverObjects prop, HashMap<String, htmlFilterImageEntry> anchor) {
+        final Iterator<htmlFilterImageEntry> iter = anchor.values().iterator();
         htmlFilterImageEntry ie;
         prop.put("info_type_use.images_images", anchor.size());
         int i = 0;
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 2ad6fa099..faa66f3ec 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -49,9 +49,9 @@ import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URLDecoder;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.TreeSet;
 
 import de.anomic.data.htmlTools;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
@@ -339,8 +339,8 @@ public class ViewFile {
                 i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
                 dark = (i % 2 == 0);
                 
-                TreeSet<htmlFilterImageEntry> ts = document.getImages();
-                Iterator<htmlFilterImageEntry> tsi = ts.iterator();
+                HashMap<String, htmlFilterImageEntry> ts = document.getImages();
+                Iterator<htmlFilterImageEntry> tsi = ts.values().iterator();
                 htmlFilterImageEntry entry;
                 while (tsi.hasNext()) {
                     entry = tsi.next();
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 785a05200..90878495f 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -135,47 +135,47 @@ public class ViewImage {
             // find original size
             int h = image.getHeight(null);
             int w = image.getWidth(null);
-
-            // System.out.println("DEBUG: get access to image " +
-            // url.toNormalform() + " is " + ((auth) ? "authorized" : "NOT
-            // authorized"));
-
+            
             // in case of not-authorized access shrink the image to prevent
-            // copyright problems
-            // so that images are not larger than thumbnails
-            if ((!auth) && ((w > 16) || (h > 16))) {
+            // copyright problems, so that images are not larger than thumbnails
+            if (auth) {
+                maxwidth = (maxwidth == 0) ? w : maxwidth;
+                maxheight = (maxheight == 0) ? h : maxheight;
+            } else if ((w > 16) || (h > 16)) {
                 maxwidth = (int) Math.min(64.0, w * 0.6);
                 maxheight = (int) Math.min(64.0, h * 0.6);
+            } else {
+                maxwidth = 16;
+                maxheight = 16;
             }
 
             // calculate width & height from maxwidth & maxheight
-            if ((maxwidth != 0) || (maxheight != 0)) {
+            if ((maxwidth < w) || (maxheight < h)) {
+                // scale image
                 double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w);
                 double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h);
                 double scale = Math.min(hs, vs);
                 if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose
                 if (scale < 1.0) {
-                    width = (int) (w * scale);
-                    height = (int) (h * scale);
+                    width = Math.max(1, (int) (w * scale));
+                    height = Math.max(1, (int) (h * scale));
                 } else {
-                    width = w;
-                    height = h;
+                    width = Math.max(1, w);
+                    height = Math.max(1, h);
                 }
+                
+                // compute scaled image
+                scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
+                MediaTracker mediaTracker = new MediaTracker(new Container());
+                mediaTracker.addImage(scaled, 0);
+                try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
             } else {
+                // do not scale
                 width = w;
                 height = h;
+                scaled = image;
             }
 
-            // check for minimum values
-            width = Math.max(width, 1);
-            height = Math.max(height, 1);
-
-            // scale image
-            scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
-            MediaTracker mediaTracker = new MediaTracker(new Container());
-            mediaTracker.addImage(scaled, 0);
-            try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
-
             if ((height == 16) && (width == 16) && (resource != null)) {
                 // this might be a favicon, store image to cache for faster re-load later on
                 iconcache.put(urlString, scaled);
diff --git a/htroot/yacysearchitem.html b/htroot/yacysearchitem.html
index 5f14af734..6291da034 100644
--- a/htroot/yacysearchitem.html
+++ b/htroot/yacysearchitem.html
@@ -22,7 +22,7 @@
   ::
   #{items}#
   <div class="thumbcontainer">
-    <a href="#[href]#" class="thumblink" onclick="return hs.expand(this)">
+    <a href="#[hrefCache]#" class="thumblink" onclick="return hs.expand(this)">
       <img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" alt="#[name]#">
     </a>
     <div class="highslide-caption"><a href="#[href]#">#[name]#<br \><a href="#[source]#">#[sourcedom]#</a></a></div>
diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java
index 1537327c6..9af955ba5 100644
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@@ -67,6 +67,7 @@ public class yacysearchitem {
         boolean rss = post.get("rss", "false").equals("true");
         boolean authenticated = sb.adminAuthenticated(header) >= 2;
         int item = post.getInt("item", -1);
+        boolean auth = ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header, true);
         
         // default settings for blank item
         prop.put("content", "0");
@@ -233,6 +234,7 @@ public class yacysearchitem {
             if (ms == null) {
                 prop.put("content_items", "0");
             } else {
+                prop.putHTML("content_items_0_hrefCache", (auth) ? "/ViewImage.png?url=" + ms.href.toNormalform(true, false) : ms.href.toNormalform(true, false));
                 prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false));
                 prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href));
                 prop.putHTML("content_items_0_name", shorten(ms.name, namelength));
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 82dbf7c4d..18e5b2956 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -54,6 +54,7 @@ import java.net.MalformedURLException;
 import java.text.Collator;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -102,7 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
 
     // class variables: collectors for links
     private HashMap<yacyURL, String> anchors;
-    private TreeSet<htmlFilterImageEntry> images; // String(absolute url)/ImageEntry relation
+    private HashMap<String, htmlFilterImageEntry> images; // urlhash/image relation
     private HashMap<String, String> metas;
     private String title;
     //private String headline;
@@ -127,7 +128,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         super(linkTags0, linkTags1);
         this.root = root;
         this.anchors = new HashMap<yacyURL, String>();
-        this.images = new TreeSet<htmlFilterImageEntry>();
+        this.images = new HashMap<String, htmlFilterImageEntry>();
         this.metas = new HashMap<String, String>();
         this.title = "";
         this.headlines = new ArrayList[4];
@@ -178,7 +179,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
             } catch (NumberFormatException e) {}
             yacyURL url = absolutePath(tagopts.getProperty("src", ""));
             htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height);
-            images.add(ie);
+            addImage(images, ie);
         }
         if (tagname.equalsIgnoreCase("base")) try {
             root = new yacyURL(tagopts.getProperty("href", ""), null);
@@ -212,7 +213,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
 
                 if (type.equalsIgnoreCase("shortcut icon")) {
                     htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1);
-                    images.add(ie);    
+                    images.put(ie.url().hash(), ie);    
                     this.favicon = newLink;
                 } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
                     anchors.put(newLink, linktitle);
@@ -234,12 +235,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         // fire event
         fireScrapeTag0(tagname, tagopts);
     }
-
+    
     public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
         // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
         if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) {
             String href = tagopts.getProperty("href", "");
-            if (href.length() > 0) anchors.put(absolutePath(href), super.stripAll(new serverCharBuffer(text)).trim().toString());
+            if (href.length() > 0) {
+                yacyURL url = absolutePath(href);
+                String f = url.getFile();
+                int p = f.lastIndexOf('.');
+                String type = (p < 0) ? "" : f.substring(p + 1);
+                if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) {
+                    // special handling of such urls: put them to the image urls
+                    htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1);
+                    addImage(images, ie);
+                } else {
+                    anchors.put(url, super.stripAll(new serverCharBuffer(text)).trim().toString());
+                }
+            }
         }
         String h;
         if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@@ -348,7 +361,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         return anchors;
     }
 
-    public TreeSet<htmlFilterImageEntry> getImages() {
+    public HashMap<String, htmlFilterImageEntry> getImages() {
         // this resturns a String(absolute url)/htmlFilterImageEntry - relation
         return images;
     }
@@ -522,5 +535,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         
         return scraper;
     }
+    
+    /** Merges every image entry of b into a (collision rule: see addImage); a null b is treated as empty. */
+    public static void addAllImages(HashMap<String, htmlFilterImageEntry> a, HashMap<String, htmlFilterImageEntry> b) {
+        if (b == null) return;
+        for (htmlFilterImageEntry ie : b.values()) {
+            // only the values matter: addImage derives the key from the url hash of the entry
+            addImage(a, ie);
+        }
+    }
+    
+    /** Stores ie in a under its url hash; on a collision ie only wins if it has positive width and height tags. */
+    public static void addImage(HashMap<String, htmlFilterImageEntry> a, htmlFilterImageEntry ie) {
+        // defensive: without a url there is no hash to key the entry by, so skip instead of throwing
+        if ((ie == null) || (ie.url() == null)) return;
+        final String urlhash = ie.url().hash(); // computed once, used for lookup and insert
+        final boolean hasSizeTags = (ie.height() > 0) && (ie.width() > 0);
+        if (hasSizeTags || !a.containsKey(urlhash)) a.put(urlhash, ie);
+    }
+    
 }
 
diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java
index ecdc1d14a..e06f1eed1 100644
--- a/source/de/anomic/http/httpdFileHandler.java
+++ b/source/de/anomic/http/httpdFileHandler.java
@@ -452,6 +452,7 @@ public final class httpdFileHandler {
                     sb.append("<html>\n<head>\n</head>\n<body>\n<h1>Index of " + path + "</h1>\n  <ul>\n");
                     File dir = new File(htDocsPath, path);
                     String[] list = dir.list();
+                    if (list == null) list = new String[0]; // should not occur!
                     File f;
                     String size;
                     long sz;
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index c14459135..4d26f8a8e 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -50,7 +50,6 @@ import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;
 
 import de.anomic.htmlFilter.htmlFilterAbstractScraper;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
@@ -97,7 +96,7 @@ public class rssParser extends AbstractParser implements Parser {
         try {
             LinkedList<String> feedSections = new LinkedList<String>();
             HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> images  = new TreeSet<htmlFilterImageEntry>();
+            HashMap<String, htmlFilterImageEntry> images  = new HashMap<String, htmlFilterImageEntry>();
             serverByteBuffer text = new serverByteBuffer();
             serverCharBuffer authors = new serverCharBuffer();
             
@@ -114,7 +113,8 @@ public class rssParser extends AbstractParser implements Parser {
             String feedDescription = reader.getChannel().getDescription();
             
             if (reader.getImage() != null) {
-                images.add(new htmlFilterImageEntry(new yacyURL(reader.getImage(), null), feedTitle, -1, -1));
+                yacyURL imgURL = new yacyURL(reader.getImage(), null);
+                images.put(imgURL.hash(), new htmlFilterImageEntry(imgURL, feedTitle, -1, -1));
             }            
             
             // loop through the feed items
@@ -154,9 +154,9 @@ public class rssParser extends AbstractParser implements Parser {
                             anchors.putAll(itemLinks);
                         }
                         
-                        TreeSet<htmlFilterImageEntry> itemImages = scraper.getImages();
+                        HashMap<String, htmlFilterImageEntry> itemImages = scraper.getImages();
                         if ((itemImages != null) && (itemImages.size() > 0)) {
-                            images.addAll(itemImages);
+                            htmlFilterContentScraper.addAllImages(images, itemImages);
                         }
                         
                         byte[] extractedText = scraper.getText();
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index 93f7b36cd..0587f0cf5 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -53,12 +53,12 @@ import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;
 import java.util.zip.GZIPInputStream;
 
 import com.ice.tar.TarEntry;
 import com.ice.tar.TarInputStream;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
@@ -132,7 +132,7 @@ public class tarParser extends AbstractParser implements Parser {
             StringBuffer docAbstrct = new StringBuffer();
 
             Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>(); 
+            HashMap<String, htmlFilterImageEntry> docImages = new HashMap<String, htmlFilterImageEntry>(); 
                         
             // looping through the contained files
             TarEntry entry;
@@ -193,7 +193,7 @@ public class tarParser extends AbstractParser implements Parser {
                 }               
                 
                 docAnchors.putAll(subDoc.getAnchors());
-                docImages.addAll(subDoc.getImages());
+                htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages());
                 
                 // release subdocument
                 subDoc.close();
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index 2b48249ed..e73c615d5 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -53,10 +53,10 @@ import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
@@ -115,7 +115,7 @@ public class zipParser extends AbstractParser implements Parser {
             LinkedList<String> docSections = new LinkedList<String>();
             StringBuffer docAbstrct = new StringBuffer();
             Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>(); 
+            HashMap<String, htmlFilterImageEntry> docImages = new HashMap<String, htmlFilterImageEntry>(); 
             
             // creating a new parser class to parse the unzipped content
             plasmaParser theParser = new plasmaParser();            
@@ -176,7 +176,7 @@ public class zipParser extends AbstractParser implements Parser {
                 }
                 
                 docAnchors.putAll(subDoc.getAnchors());
-                docImages.addAll(subDoc.getImages());
+                htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages());
                 
                 // release subdocument
                 subDoc.close();
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 01d0edb6f..4c2bc8377 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -107,6 +107,19 @@ public final class plasmaCondenser {
     
     private final static int numlength = 5;
 
+    // lookup table used by invisible(char): one flag per character of the range ' '..'z',
+    // indexed by (c - ' '); true marks a character that counts as invisible
+    private static boolean[] invisibleChar = new boolean['z' - ' ' + 1];
+    static {
+        // a new boolean array is all-false by default, so only the
+        // invisible characters need to be flagged
+        final char[] flagged = "\"$%&/()=`^+*#'-_:;,<>[]\\".toCharArray();
+        int k = flagged.length;
+        while (k-- > 0) {
+            invisibleChar[flagged[k] - ' '] = true;
+        }
+    }
+    
     //private Properties analysis;
     private TreeMap<String, wordStatProp> words; // a string (the words) to (wordStatProp) - relation
     private HashMap<StringBuffer, phraseStatProp> sentences;
@@ -198,7 +211,7 @@ public final class plasmaCondenser {
             }
 
             // images
-            Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
+            Iterator<htmlFilterImageEntry> j = document.getImages().values().iterator();
             htmlFilterImageEntry ientry;
             while (j.hasNext()) {
                 ientry = j.next();
@@ -659,7 +672,7 @@ public final class plasmaCondenser {
     public final static boolean invisible(char c) {
         // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
         if ((c < ' ') || (c > 'z')) return true;
-        return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
+        return invisibleChar[c - ' '];
     }
 
     public static Enumeration<StringBuffer> wordTokenizer(String s, String charset, int minLength) {
@@ -727,7 +740,7 @@ public final class plasmaCondenser {
 
         public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
             e = new sentencesFromInputStreamEnum(is, charset);
-            s = new StringBuffer();
+            s = new StringBuffer(20);
             buffer = nextElement0();
         }
 
@@ -859,9 +872,9 @@ public final class plasmaCondenser {
     }
 
     static StringBuffer readSentence(Reader reader, boolean pre) throws IOException {
-        StringBuffer s = new StringBuffer();
+        StringBuffer s = new StringBuffer(20);
         int nextChar;
-        char c;
+        char c, lc = (char) 0;
         
         // find sentence end
         for (;;) {
@@ -871,20 +884,14 @@ public final class plasmaCondenser {
                 if (s.length() == 0) return null; else break;
             }
             c = (char) nextChar;
+            if (pre && ((c == (char) 10) || (c == (char) 13))) break;
+            if ((c == (char) 8) || (c == (char) 10) || (c == (char) 13)) c = ' ';
+            if ((lc == ' ') && (c == ' ')) continue; // ignore double spaces
             s.append(c);
-            if (pre) {
-                if ((c == (char) 10) || (c == (char) 13)) break;
-            } else {
-                if (htmlFilterContentScraper.punctuation(c)) break;
-            }
-        }
-
-        // replace line endings and tabs by blanks
-        for (int i = 0; i < s.length(); i++) {
-            if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' ');
+            if (htmlFilterContentScraper.punctuation(c)) break;
+            lc = c;
         }
-        // remove all double-spaces
-        int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
+        
         return s;
     }
 
diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java
index 960765c53..75e32bcba 100644
--- a/source/de/anomic/plasma/plasmaCrawlBalancer.java
+++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java
@@ -130,7 +130,10 @@ public class plasmaCrawlBalancer {
     }
     
     public void finalize() {
-        if (urlFileStack != null) close();
+        // last-resort cleanup; a null urlFileStack presumably means close() already ran (confirm in close())
+        if (urlFileStack == null) return;
+        serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " + stackname + " closed by finalizer");
+        close();
     }
     
     public synchronized void clear() {
diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java
index faa53c931..244a8d1f9 100644
--- a/source/de/anomic/plasma/plasmaCrawlNURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlNURL.java
@@ -49,6 +49,8 @@ import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
 
+import de.anomic.server.logging.serverLog;
+
 public class plasmaCrawlNURL {
     
     public static final int STACK_TYPE_NULL     =  0; // do not stack
@@ -64,9 +66,9 @@ public class plasmaCrawlNURL {
     private static final long minimumGlobalDelta = 500; // the minimum time difference between access of the same global domain
     private static final long maximumDomAge =  60000; // the maximum age of a domain until it is used for another crawl attempt
     
-    private final plasmaCrawlBalancer coreStack;      // links found by crawling to depth-1
-    private final plasmaCrawlBalancer limitStack;     // links found by crawling at target depth
-    private final plasmaCrawlBalancer remoteStack;    // links from remote crawl orders
+    private plasmaCrawlBalancer coreStack;      // links found by crawling to depth-1
+    private plasmaCrawlBalancer limitStack;     // links found by crawling at target depth
+    private plasmaCrawlBalancer remoteStack;    // links from remote crawl orders
     //private final plasmaCrawlBalancer overhangStack;  // links found by crawling at depth+1
     //private kelondroStack imageStack;     // links pointing to image resources
     //private kelondroStack movieStack;     // links pointing to movie resources
@@ -81,10 +83,26 @@ public class plasmaCrawlNURL {
     }
 
     public void close() {
-        coreStack.close();
-        limitStack.close();
+        if (coreStack != null) { // closes all three crawl stacks; a null field means that stack is already closed
+            coreStack.close();
+            coreStack = null; // cleared so that a repeated close() or finalize() skips this stack
+        }
+        if (limitStack != null) { // same close-once pattern for the limit stack
+            limitStack.close();
+            limitStack = null;
+        }
         //overhangStack.close();
-        remoteStack.close();
+        if (remoteStack != null) { // same close-once pattern for the remote stack
+            remoteStack.close();
+            remoteStack = null;
+        }
+    }
+    
+    public void finalize() {
+        // close() nulls each stack field, so three nulls mean there is nothing left to release
+        if ((coreStack == null) && (limitStack == null) && (remoteStack == null)) return;
+        serverLog.logWarning("plasmaCrawlNURL", "NURL stack closed by finalizer");
+        close();
     }
     
     public boolean notEmpty() {
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index a7001118e..0344ae68b 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -39,6 +39,7 @@ import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
@@ -747,7 +748,7 @@ public final class plasmaParser {
         
     }
     
-    static Map<yacyURL, String> allReflinks(Set<?> links) {
+    static Map<yacyURL, String> allReflinks(Collection<?> links) {
         // links is either a Set of Strings (with urls) or htmlFilterImageEntries
         // we find all links that are part of a reference inside a url
         HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
@@ -786,7 +787,7 @@ public final class plasmaParser {
         return v;
     }
     
-    static Map<yacyURL, String> allSubpaths(Set<?> links) {
+    static Map<yacyURL, String> allSubpaths(Collection<?> links) {
         // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries
         HashSet<String> h = new HashSet<String>();
         Iterator<?> i = links.iterator();
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index 014eedefc..e9c87581b 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -61,6 +61,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.TreeSet;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.parser.Parser;
 
@@ -76,7 +77,7 @@ public class plasmaParserDocument {
     private StringBuffer description;   // an abstract, if present: short content description
     private Object text;            // the clear text, all that is visible
     private Map<yacyURL, String> anchors;    // all links embedded as clickeable entities (anchor tags)
-    private TreeSet<htmlFilterImageEntry> images;         // all visible pictures in document
+    private HashMap<String, htmlFilterImageEntry> images;         // all visible pictures in document
     // the anchors and images - Maps are URL-to-EntityDescription mappings.
     // The EntityDescription appear either as visible text in anchors or as alternative
     // text in image tags.
@@ -89,7 +90,7 @@ public class plasmaParserDocument {
     protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    Object text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+                    Object text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
         this.source = location;
         this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
         this.charset = charset;
@@ -99,7 +100,7 @@ public class plasmaParserDocument {
         this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
         this.description = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
         this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : anchors;
-        this.images =  (images == null) ? new TreeSet<htmlFilterImageEntry>() : images;
+        this.images =  (images == null) ? new HashMap<String, htmlFilterImageEntry>() : images;
         this.hyperlinks = null;
         this.audiolinks = null;
         this.videolinks = null;
@@ -124,21 +125,21 @@ public class plasmaParserDocument {
     public plasmaParserDocument(yacyURL location, String mimeType, String charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    byte[] text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+                    byte[] text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
         this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
     }
     
     public plasmaParserDocument(yacyURL location, String mimeType, String charset,
             String[] keywords, String title, String author,
             String[] sections, String abstrct,
-            File text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+            File text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
         this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
     }
     
     public plasmaParserDocument(yacyURL location, String mimeType, String charset,
             String[] keywords, String title, String author,
             String[] sections, String abstrct,
-            serverCachedFileOutputStream text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+            serverCachedFileOutputStream text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
         this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
     }
 
@@ -310,7 +311,7 @@ dc_rights
         return this.videolinks;
     }
     
-    public TreeSet<htmlFilterImageEntry> getImages() {
+    public HashMap<String, htmlFilterImageEntry> getImages() {
         // returns all links enbedded as pictures (visible in document)
         // this resturns a htmlFilterImageEntry collection
         if (!resorted) resortLinks();
@@ -341,7 +342,7 @@ dc_rights
         audiolinks = new HashMap<yacyURL, String>();
         applinks   = new HashMap<yacyURL, String>();
         emaillinks = new HashMap<String, String>();
-        TreeSet<htmlFilterImageEntry> collectedImages = new TreeSet<htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
+        HashMap<String, htmlFilterImageEntry> collectedImages = new HashMap<String, htmlFilterImageEntry>(); // this is a map that is collected now and joined later to the imagelinks
         Map.Entry<yacyURL, String> entry;
         while (i.hasNext()) {
             entry = i.next();
@@ -361,7 +362,7 @@ dc_rights
                     if (plasmaParser.mediaExtContains(ext)) {
                         // this is not a normal anchor, its a media link
                         if (plasmaParser.imageExtContains(ext)) {
-                            collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
+                            htmlFilterContentScraper.addImage(collectedImages, new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
                         }
                         else if (plasmaParser.audioExtContains(ext)) audiolinks.put(url, (String)entry.getValue());
                         else if (plasmaParser.videoExtContains(ext)) videolinks.put(url, (String)entry.getValue());
@@ -374,23 +375,18 @@ dc_rights
         }
         
         // add image links that we collected from the anchors to the image map
-        Iterator<htmlFilterImageEntry>  j = collectedImages.iterator();
-        htmlFilterImageEntry iEntry;
-        while (j.hasNext()) {
-            iEntry = (htmlFilterImageEntry) j.next();
-            if (!images.contains(iEntry)) images.add(iEntry);
-        }
+        htmlFilterContentScraper.addAllImages(images, collectedImages);
        
         // expand the hyperlinks:
         // we add artificial hyperlinks to the hyperlink set
         // that can be calculated from given hyperlinks and imagelinks
         
-        hyperlinks.putAll(plasmaParser.allReflinks(images));
+        hyperlinks.putAll(plasmaParser.allReflinks(images.values()));
         hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
         hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
         hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
         hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
-        hyperlinks.putAll(plasmaParser.allSubpaths(images));
+        hyperlinks.putAll(plasmaParser.allSubpaths(images.values()));
         hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
         hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
         hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
@@ -417,7 +413,7 @@ dc_rights
         serverFileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text);
         
         anchors.putAll(doc.getAnchors());
-        images.addAll(doc.getImages());
+        htmlFilterContentScraper.addAllImages(images, doc.getImages());
     }
     
     /**
diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java
index 90ae0ec01..aabaf4cc8 100644
--- a/source/de/anomic/plasma/plasmaSearchImages.java
+++ b/source/de/anomic/plasma/plasmaSearchImages.java
@@ -43,9 +43,10 @@ package de.anomic.plasma;
 
 import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.util.HashMap;
 import java.util.Iterator;
-import java.util.TreeSet;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.parser.ParserException;
 import de.anomic.server.serverDate;
@@ -53,11 +54,11 @@ import de.anomic.yacy.yacyURL;
 
 public final class plasmaSearchImages {
 
-    private TreeSet<htmlFilterImageEntry> images;
+    private HashMap<String, htmlFilterImageEntry> images;
     
     public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
         long start = System.currentTimeMillis();
-        this.images = new TreeSet<htmlFilterImageEntry>();
+        this.images = new HashMap<String, htmlFilterImageEntry>();
         if (maxTime > 10) {
             Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false);
             InputStream res = (InputStream) resource[0];
@@ -75,7 +76,7 @@ public final class plasmaSearchImages {
                 if (document == null) return;
                 
                 // add the image links
-                this.addAll(document.getImages());
+                htmlFilterContentScraper.addAllImages(this.images, document.getImages());
 
                 // add also links from pages one step deeper, if depth > 0
                 if (depth > 0) {
@@ -97,26 +98,13 @@ public final class plasmaSearchImages {
     
     public void addAll(plasmaSearchImages m) {
         synchronized (m.images) {
-            addAll(m.images);
-        }
-    }
-    
-    private void addAll(TreeSet<htmlFilterImageEntry> ts) {
-        Iterator<htmlFilterImageEntry> i = ts.iterator();
-        htmlFilterImageEntry ie;
-        while (i.hasNext()) {
-            ie = i.next();
-            if (images.contains(ie)) {
-                if ((ie.height() > 0) && (ie.width() > 0)) images.add(ie);
-            } else {
-                images.add(ie);
-            }
+            htmlFilterContentScraper.addAllImages(this.images, m.images);
         }
     }
     
     public Iterator<htmlFilterImageEntry> entries() {
         // returns htmlFilterImageEntry - Objects
-        return images.iterator();
+        return images.values().iterator();
     }
     
 }
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 42accd875..3b336ee69 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -697,7 +697,8 @@ public class plasmaSnippetCache {
     
     public static ArrayList<MediaSnippet> computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
         
-        TreeSet<htmlFilterImageEntry> images = document.getImages(); // iterates images in descending size order!
+        TreeSet<htmlFilterImageEntry> images = new TreeSet<htmlFilterImageEntry>();
+        images.addAll(document.getImages().values()); // iterates images in descending size order!
         // a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode()
         
         Iterator<htmlFilterImageEntry> i = images.iterator();