redesigned some parts of the html scanner & parser

to better support image tags

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1995 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent ac114d69c0
commit 83e0e765ec

@ -66,9 +66,10 @@ public class ViewImage {
}
int width = post.getInt("width", 0);
int height = post.getInt("height", 0);
int timeout = post.getInt("timeout", 5000);
// load image
byte[] imgb = sb.snippetCache.getResource(url, true);
byte[] imgb = sb.snippetCache.getResource(url, true, timeout);
if (imgb == null) return null;
// create image
@ -76,7 +77,20 @@ public class ViewImage {
Image original = Toolkit.getDefaultToolkit().createImage(imgb);
mediaTracker.addImage(original, 0);
try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
if ((width == 0) || (height == 0)) return original;
boolean auth = ((String) header.get("CLIENTIP", "")).equals("localhost") || sb.verifyAuthentication(header, false); // handle access rights
if ((auth) && ((width == 0) || (height == 0))) return original;
// in case of not-authorized access shrink the image to prevent copyright problems
// so that images are not larger than thumbnails
if (!auth) {
width = width / 2;
height = height / 2;
int xsc = Math.max(width, height);
if (xsc > 64) {
width = width * 64 / xsc;
height = height * 64 / xsc;
}
}
// scale image
Image scaled = original.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);

@ -89,7 +89,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// class variables: collectors for links
private HashMap anchors;
private HashMap images;
private TreeSet images; // String(absolute url)/ImageEntry relation
private HashMap metas;
private String title;
//private String headline;
@ -103,7 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
super(linkTags0, linkTags1);
this.root = root;
this.anchors = new HashMap();
this.images = new HashMap();
this.images = new TreeSet();
this.metas = new HashMap();
this.title = "";
this.headlines = new ArrayList[4];
@ -112,55 +112,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
// Appends the tag-stripped visible text of a scraped segment to the page
// content buffer, separating segments with a single space (byte 32).
// The +1 in the serverByteBuffer size presumably leaves room for the
// trailing space appended below -- TODO confirm against serverByteBuffer.
public void scrapeText(byte[] newtext) {
// debug: System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
content.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
}
/*
public static String urlNormalform(String us) {
if (us == null) { return null; }
if (us.length() == 0) { return null; }
serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us);
// TODO: what about
// - case insensitive domain names
// - chars that should be escaped in URLs
// cutting of everything behind #
int cpos = us.indexOf("#");
if (cpos >= 0) { us = us.substring(0, cpos); }
if (us.startsWith("https")) {
if (us.endsWith(":443")) {
us = us.substring(0, us.length() - 4);
serverLog.logFinest("htmlFilter", "urlNormalform: :443=" + us);
} else {
cpos = us.indexOf(":443/");
if (cpos >= 0) {
us = us.substring(0, cpos).concat(us.substring(cpos + 4));
serverLog.logFinest("htmlFilter", "urlNormalform: :443/=" + us);
}
}
} else if (us.startsWith("http")) {
if (us.endsWith(":80")) {
us = us.substring(0, us.length() - 3);
serverLog.logFinest("htmlFilter", "urlNormalform: :80=" + us);
} else {
cpos = us.indexOf(":80/");
if (cpos >= 0) {
us = us.substring(0, cpos).concat(us.substring(cpos + 3));
serverLog.logFinest("htmlFilter", "urlNormalform: :80/=" + us);
}
}
}
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
serverLog.logFine("htmlFilter", "urlNormalform: OUT=" + us);
return us;
}
*/
public static String urlNormalform(URL url) {
boolean defaultPort = false;
// serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'");
@ -212,7 +168,18 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
public void scrapeTag0(String tagname, Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
if (tagname.equalsIgnoreCase("img")) {
int width = -1, height = -1;
try {
width = Integer.parseInt(tagopts.getProperty("width", "-1"));
height = Integer.parseInt(tagopts.getProperty("height", "-1"));
} catch (NumberFormatException e) {}
try {
URL url = new URL(absolutePath(tagopts.getProperty("src", "")));
htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height);
images.add(ie);
} catch (MalformedURLException e) {}
}
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
if (tagname.equalsIgnoreCase("meta")) {
@ -230,7 +197,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@ -303,7 +270,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return anchors;
}
public Map getImages() {
public TreeSet getImages() {
// this returns a String(absolute url)/htmlFilterImageEntry - relation
return images;
}
@ -389,7 +357,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
System.out.println("METAS :" + metas.toString());
System.out.println("TEXT :" + new String(content.getBytes()));
}
/*
public static void main(String[] args) {
try {

@ -0,0 +1,101 @@
// htmlFilterImageEntry.java
// -----------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
// created 04.04.2006
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.htmlFilter;
import java.net.URL;
public class htmlFilterImageEntry {
private URL url;
private String alt;
private int width, height;
public htmlFilterImageEntry(URL url, String alt, int width, int height) {
this.url = url;
this.alt = alt;
this.width = width;
this.height = height;
}
public URL url() {
return this.url;
}
public String alt() {
return this.alt;
}
public int width() {
return this.width;
}
public int height() {
return this.height;
}
public String toString() {
return "{" + alt + ", " + width + "/" + height + "}";
}
public int hashCode() {
if ((width > 0) && (height > 0))
return ((0xFFFF - (((width * height) >> 8) & 0xFFFF)) << 16) | (url.hashCode() & 0xFFFF);
else
return 0xFFFF0000 | (url.hashCode() & 0xFFFF);
}
public int compareTo(Object h) {
// this is needed if this object is stored in a TreeSet
assert (url != null);
assert (h instanceof htmlFilterImageEntry);
if (this.url.equals(((htmlFilterImageEntry) h).url)) return 0;
int thc = this.hashCode();
int ohc = ((htmlFilterImageEntry) h).hashCode();
if (thc < ohc) return -1;
if (thc > ohc) return 1;
return 0;
}
public boolean equals(Object o) {
if (!(o instanceof htmlFilterImageEntry)) return false;
return compareTo(o) == 0;
}
}

@ -53,9 +53,11 @@ import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
@ -104,7 +106,7 @@ public class rssParser extends AbstractParser implements Parser {
try {
LinkedList feedSections = new LinkedList();
HashMap anchors = new HashMap();
HashMap images = new HashMap();
TreeSet images = new TreeSet();
serverByteBuffer text = new serverByteBuffer();
@ -125,7 +127,7 @@ public class rssParser extends AbstractParser implements Parser {
ImageIF channelImage = channel.getImage();
if (channelImage != null) {
images.put(channelImage.getLocation().toString(),channelImage.getTitle());
images.add(new htmlFilterImageEntry(channelImage.getLocation(), channelImage.getTitle(), -1, -1));
}
// loop through the feed items
@ -162,9 +164,9 @@ public class rssParser extends AbstractParser implements Parser {
anchors.putAll(itemLinks);
}
Map itemImages = scraper.getImages();
TreeSet itemImages = scraper.getImages();
if ((itemImages != null) && (itemImages.size() > 0)) {
images.putAll(itemImages);
images.addAll(itemImages);
}
byte[] extractedText = scraper.getText();

@ -51,6 +51,7 @@ import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import com.ice.tar.TarEntry;
@ -115,7 +116,7 @@ public class tarParser extends AbstractParser implements Parser {
StringBuffer docAbstrct = new StringBuffer();
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
Map docImages = new HashMap();
TreeSet docImages = new TreeSet();
// looping through the contained files
TarEntry entry;
@ -174,7 +175,7 @@ public class tarParser extends AbstractParser implements Parser {
docText.append(theDoc.getText());
docAnchors.putAll(theDoc.getAnchors());
docImages.putAll(theDoc.getImages());
docImages.addAll(theDoc.getImages());
}
/* (URL location, String mimeType,

@ -51,6 +51,7 @@ import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@ -100,8 +101,7 @@ public class zipParser extends AbstractParser implements Parser {
StringBuffer docAbstrct = new StringBuffer();
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
Map docImages = new HashMap();
TreeSet docImages = new TreeSet();
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
@ -151,7 +151,7 @@ public class zipParser extends AbstractParser implements Parser {
docText.append(theDoc.getText());
docAnchors.putAll(theDoc.getAnchors());
docImages.putAll(theDoc.getImages());
docImages.addAll(theDoc.getImages());
}
/* (URL location, String mimeType,

@ -418,10 +418,11 @@ public final class plasmaCrawlLURL extends plasmaURL {
// - phrasecount, total number of phrases
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - ETag: for re-crawl decision upon HEAD request
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - ETag: for re-crawl decision upon HEAD request
// - int: # of keywords
// - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database

@ -247,6 +247,7 @@ public final class plasmaCrawlWorker extends Thread {
}
}
public static plasmaHTCache.Entry load(
URL url,
String name,

@ -132,7 +132,12 @@ public final class plasmaParser {
/**
* A list of media extensions that should <b>not</b> be handled by the plasmaParser
*/
private static final HashSet mediaExtSet = new HashSet(28);
private static final HashSet mediaExtSet = new HashSet();
/**
* A list of image extensions that should be handleable by image viewer apps
*/
private static final HashSet imageExtSet = new HashSet();
/**
* This {@link FilenameFilter} is used to find all classes based on there filenames
@ -160,8 +165,17 @@ public final class plasmaParser {
* @see #initMediaExt(String)
*/
static {
initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
initMediaExt(extString2extList(
        // NOTE: each segment must end with a comma; the previous version
        // wrote "...bat,sh" + "tar,..." which merged into the bogus
        // extension "shtar" and dropped both "sh" and "tar" from the list
        "sit,hqx,img,dmg,exe,com,bat,sh," +  // application containers
        "tar,gz,bz2,arj,zip,rar," +          // archive formats
        "ps,xls,ppt,asf," +                  // text formats without support
        "mp3,ogg,aac," +                     // audio formats
        "swf,avi,wmv,rm,mov,mpg,mpeg,ram," + // video formats
        "jpg,jpeg,jpe,gif,png"               // image formats
        ));
initImageExt(extString2extList(
        "jpg,jpeg,jpe,gif,png"               // image formats
        ));
/* ===================================================
* initializing the parser object pool
@ -225,8 +239,6 @@ public final class plasmaParser {
}
}
public static List extString2extList(String extString) {
LinkedList extensions = new LinkedList();
if ((extString == null) || (extString.length() == 0)) {
@ -245,6 +257,13 @@ public final class plasmaParser {
}
}
// Replaces the global set of image-file extensions with the given list.
// Synchronized on imageExtSet so concurrent readers (see imageExtContains)
// never observe a half-initialized set.
public static void initImageExt(List imageExtList) {
synchronized (imageExtSet) {
imageExtSet.clear();
imageExtSet.addAll(imageExtList);
}
}
public static String getMediaExtList() {
synchronized (mediaExtSet) {
return mediaExtSet.toString();
@ -315,6 +334,13 @@ public final class plasmaParser {
}
}
// Tells whether the given file extension is one of the configured image
// extensions. The lookup ignores surrounding whitespace and letter case;
// a null argument never matches. The set lookup is synchronized against
// concurrent re-initialization (see initImageExt).
public static boolean imageExtContains(String imageExt) {
    if (imageExt == null) return false;
    // normalize outside the lock; only the lookup needs synchronization
    String key = imageExt.trim().toLowerCase();
    synchronized (imageExtSet) {
        return imageExtSet.contains(key);
    }
}
public static String getRealMimeType(String mimeType) {
//if (mimeType == null) doMimeTypeAnalysis
if (mimeType == null) mimeType = "application/octet-stream";

@ -43,12 +43,15 @@
package de.anomic.plasma;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
public class plasmaParserDocument {
@ -61,7 +64,7 @@ public class plasmaParserDocument {
String abstrct; // an abstract, if present: short content description
byte[] text; // the clear text, all that is visible
Map anchors; // all links embedded as clickeable entities (anchor tags)
Map images; // all visible pictures in document
TreeSet images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
@ -69,11 +72,12 @@ public class plasmaParserDocument {
Map medialinks;
Map emaillinks;
plasmaCondenser condenser;
boolean resorted;
public plasmaParserDocument(URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images) {
byte[] text, Map anchors, TreeSet images) {
this.location = location;
this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
this.keywords = (keywords==null)?"":keywords;
@ -83,23 +87,14 @@ public class plasmaParserDocument {
this.abstrct = (abstrct==null)?"":abstrct;
this.text = (text==null)?new byte[0]:text;
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new HashMap(0):images;
this.images = (images==null)?new TreeSet():images;
this.hyperlinks = null;
this.medialinks = null;
this.emaillinks = null;
this.condenser = null;
this.resorted = false;
}
/*
private String absolutePath(String relativePath) {
try {
return htmlFilterContentScraper.urlNormalform(location, relativePath);
} catch (Exception e) {
return "";
}
}
*/
public String getMimeType() {
return this.mimeType;
}
@ -143,8 +138,10 @@ public class plasmaParserDocument {
return anchors;
}
public Map getImages() {
public TreeSet getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
return images;
}
@ -152,23 +149,25 @@ public class plasmaParserDocument {
public Map getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (hyperlinks == null) resortLinks();
if (!resorted) resortLinks();
return hyperlinks;
}
public Map getMedialinks() {
// this is partly subset of getAnchor and getImage: all non-hyperrefs
if (medialinks == null) resortLinks();
if (!resorted) resortLinks();
return medialinks;
}
public Map getEmaillinks() {
// this is part of the getAnchor-set: only links to email addresses
if (emaillinks == null) resortLinks();
if (!resorted) resortLinks();
return emaillinks;
}
private synchronized void resortLinks() {
// extract hyperlinks, medialinks and emaillinks from anchorlinks
Iterator i;
String url;
int extpos, qpos;
@ -177,6 +176,7 @@ public class plasmaParserDocument {
hyperlinks = new HashMap();
medialinks = new HashMap();
emaillinks = new HashMap();
TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
@ -190,42 +190,54 @@ public class plasmaParserDocument {
if (((qpos = url.indexOf("?")) >= 0) && (qpos > extpos)) {
ext = url.substring(extpos, qpos).toLowerCase();
} else {
ext = url.substring(extpos).toLowerCase();
ext = url.substring(extpos).toLowerCase();
}
normal = htmlFilterContentScraper.urlNormalform(null, url);
if (normal != null) { //TODO: extension function is not correct
if (plasmaParser.mediaExtContains(ext.substring(1))) {
// this is not an normal anchor, its a media link
// this is not a normal anchor, its a media link
medialinks.put(normal, entry.getValue());
} else {
hyperlinks.put(normal, entry.getValue());
}
if (plasmaParser.imageExtContains(ext.substring(1))) {
try {
collectedImages.add(new htmlFilterImageEntry(new URL(normal), "", -1, -1));
} catch (MalformedURLException e) {}
}
}
}
}
}
// finally add the images to the medialinks
i = images.entrySet().iterator();
// add the images to the medialinks
i = images.iterator();
String normal;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
normal = htmlFilterContentScraper.urlNormalform(null, url);
if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
if (normal != null) medialinks.put(normal, ((htmlFilterImageEntry) entry.getValue()).alt()); // avoid NullPointerException
}
expandHyperlinks();
}
public synchronized void expandHyperlinks() {
// we add artificial hyperlinks to the hyperlink set that can be calculated from
// given hyperlinks and imagelinks
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
hyperlinks.putAll(plasmaParser.allReflinks(medialinks));
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));
// finally add image links that we collected from the anchors to the image map
i = collectedImages.iterator();
htmlFilterImageEntry iEntry;
while (i.hasNext()) {
iEntry = (htmlFilterImageEntry) i.next();
if (!images.contains(iEntry)) images.add(iEntry);
}
// don't do this again
this.resorted = true;
}
}

@ -0,0 +1,119 @@
// plasmaSearchImages.java
// -----------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
// Created: 04.04.2006
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.server.serverDate;
/**
 * Collects image references for search results: loads each URL (from the
 * cache or, if permitted, online), parses the document, gathers its
 * htmlFilterImageEntry objects and optionally follows hyperlinks up to a
 * given depth, all within a millisecond time budget.
 */
public final class plasmaSearchImages {

    // sorted set of htmlFilterImageEntry objects collected so far
    private TreeSet images;

    /**
     * Collects images from one document and, if depth > 0, from the pages
     * it links to. maxTime is the budget in milliseconds; budgets of 10 ms
     * or less abort immediately.
     */
    public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, URL url, int depth) {
        long start = System.currentTimeMillis();
        this.images = new TreeSet();
        if (maxTime > 10) {
            byte[] res = sc.getResource(url, true, (int) maxTime);
            if (res != null) {
                plasmaParserDocument document = sc.parseDocument(url, res);

                // add the image links of this document
                this.addAll(document.getImages());

                // descend into linked pages, passing on the remaining budget
                if (depth > 0) {
                    Map hl = document.getHyperlinks();
                    Iterator i = hl.entrySet().iterator();
                    while (i.hasNext()) {
                        Map.Entry e = (Map.Entry) i.next();
                        String nexturlstring = htmlFilterContentScraper.urlNormalform(null, (String) e.getKey());
                        try {
                            addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), new URL(nexturlstring), depth - 1));
                        } catch (MalformedURLException e2) {}
                    }
                }
            }
        }
    }

    /**
     * Collects images from every URL of a search result set, sharing one
     * time budget across all results.
     */
    public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) {
        long start = System.currentTimeMillis();
        this.images = new TreeSet();
        plasmaCrawlLURL.Entry urlentry;
        while (sres.hasMoreElements()) {
            urlentry = sres.nextElement();
            addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth));
        }
    }

    /** Merges all images collected by another plasmaSearchImages instance. */
    public void addAll(plasmaSearchImages m) {
        synchronized (m.images) {
            addAll(m.images);
        }
    }

    private void addAll(TreeSet ts) {
        // merge htmlFilterImageEntry objects into this.images
        Iterator i = ts.iterator();
        htmlFilterImageEntry ie;
        while (i.hasNext()) {
            ie = (htmlFilterImageEntry) i.next();
            if (images.contains(ie)) {
                // an entry for this image already exists; prefer the one that
                // carries explicit dimensions. TreeSet.add() is a no-op for an
                // element that compares equal to an existing one, so the stale
                // entry must be removed first (the previous code called add()
                // alone, which never actually replaced anything).
                if ((ie.height() > 0) && (ie.width() > 0)) {
                    images.remove(ie);
                    images.add(ie);
                }
            } else {
                images.add(ie);
            }
        }
    }

    /** @return an iterator over the collected htmlFilterImageEntry objects */
    public Iterator entries() {
        return images.iterator();
    }
}

@ -1,184 +0,0 @@
// plasmaSearchMedia.java
// -----------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
// Created: 03.04.2006
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
/**
 * Collects media links (files with one of a given set of extensions) for
 * search results: loads each result URL, parses the document, keeps every
 * media link whose file extension is in the requested set, and optionally
 * follows hyperlinks up to a given depth.
 */
public final class plasmaSearchMedia {

    private HashSet ext;   // accepted file extensions, lower-case, no dot
    private TreeSet media; // collected Entry objects

    public plasmaSearchMedia(plasmaSnippetCache sc, String exts, URL url, int depth) {
        this(sc, extGen(exts), url, depth);
    }

    public plasmaSearchMedia(plasmaSnippetCache sc, HashSet exts, URL url, int depth) {
        this.ext = exts;
        this.media = new TreeSet();
        byte[] res = sc.getResource(url, true);
        if (res != null) {
            plasmaParserDocument document = sc.parseDocument(url, res);

            // add the media links whose extension is in the accepted set
            Map ml = document.getMedialinks();
            Iterator i = ml.entrySet().iterator();
            while (i.hasNext()) {
                Map.Entry e = (Map.Entry) i.next();
                String nexturlstring = htmlFilterContentScraper.urlNormalform(null, (String) e.getKey());
                int p = nexturlstring.lastIndexOf(".");
                if ((p > 0) && (this.ext.contains(nexturlstring.substring(p + 1)))) {
                    try {
                        media.add(new Entry(new URL(nexturlstring), 0));
                    } catch (MalformedURLException e1) {}
                }
            }

            // add also links from pages one step deeper, if depth > 0
            if (depth > 0) {
                Map hl = document.getHyperlinks();
                i = hl.entrySet().iterator();
                while (i.hasNext()) {
                    Map.Entry e = (Map.Entry) i.next();
                    String nexturlstring = htmlFilterContentScraper.urlNormalform(null, (String) e.getKey());
                    try {
                        addAll(new plasmaSearchMedia(sc, ext, new URL(nexturlstring), depth - 1));
                    } catch (MalformedURLException e2) {}
                }
            }
        }
    }

    public plasmaSearchMedia(plasmaSnippetCache sc, String exts, plasmaSearchResult sres) {
        this(sc, extGen(exts), sres);
    }

    public plasmaSearchMedia(plasmaSnippetCache sc, HashSet exts, plasmaSearchResult sres) {
        this.ext = exts;
        this.media = new TreeSet();
        plasmaCrawlLURL.Entry urlentry;
        while (sres.hasMoreElements()) {
            urlentry = sres.nextElement();
            addAll(new plasmaSearchMedia(sc, ext, urlentry.url(), 0));
        }
    }

    /** Splits a comma- and/or space-separated extension list into a set. */
    private static HashSet extGen(String ext) {
        // String.replaceAll returns a NEW string; the result must be assigned
        // (the previous code discarded it, so commas were never turned into
        // spaces and comma-separated lists produced useless set entries)
        String[] exts = ext.replaceAll(",", " ").split(" ");
        HashSet s = new HashSet(exts.length);
        for (int i = 0; i < exts.length; i++) s.add(exts[i]);
        return s;
    }

    public void addAll(plasmaSearchMedia m) {
        this.media.addAll(m.media);
    }

    /** @return an iterator over the collected Entry objects */
    public Iterator entries() {
        return media.iterator();
    }

    /**
     * One collected media link with optional size or pixel dimensions.
     * Stored in a TreeSet, so it must implement Comparable (a TreeSet casts
     * its elements to Comparable on insertion).
     */
    public class Entry implements Comparable {
        private URL url;
        private int size, width, height;

        public Entry(URL url, int size) {
            this.url = url;
            this.size = size;
            this.width = -1;
            this.height = -1;
        }

        public Entry(URL url, int width, int height) {
            this.url = url;
            this.size = -1;
            this.width = width;
            this.height = height;
        }

        public URL url() {
            return this.url;
        }

        public int size() {
            return this.size;
        }

        public int width() {
            return this.width;
        }

        public int height() {
            return this.height;
        }

        public int hashCode() {
            // ordering key: pixel area (or byte size) in the high bits,
            // URL hash in the low 16 bits
            if ((width > 0) && (height > 0))
                return (((width * height) >> 8) << 16) | (url.hashCode() & 0xFFFF);
            else
                return ((size >> 8) << 16) | (url.hashCode() & 0xFFFF);
        }

        public int compareTo(Object h) {
            // this is needed if this object is stored in a TreeSet
            assert (url != null);
            assert (h instanceof plasmaSearchMedia.Entry);
            int thc = this.hashCode();
            int ohc = ((plasmaSearchMedia.Entry) h).hashCode();
            if (thc < ohc) return -1;
            if (thc > ohc) return 1;
            return 0;
        }
    }
}

@ -398,13 +398,12 @@ public class plasmaSnippetCache {
}
}
public byte[] getResource(URL url, boolean fetchOnline) {
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
// load the url as resource from the web
try {
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
byte[] resource = cacheManager.loadResource(url);
if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, 5000);
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
resource = cacheManager.loadResource(url);
}
return resource;

@ -1958,7 +1958,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url,
// fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself

@ -299,7 +299,13 @@ public final class serverDate {
return "unknown";
}
}
// Computes how many milliseconds remain of a time budget of <due> ms that
// began at <start> (epoch ms). Returns -1 when due < 0 (negative budgets
// appear to mean "unlimited" -- TODO confirm against callers). Once the
// budget is exhausted (r <= 0) it returns <minimum> instead of zero or a
// negative value, granting follow-up operations a small time floor.
// NOTE(review): remaining values with 0 < r < minimum are returned as-is,
// i.e. the floor only applies AFTER expiry -- confirm this asymmetry is
// intended rather than Math.max(r, minimum).
public static long remainingTime(long start, long due, long minimum) {
if (due < 0) return -1;
long r = due + start - System.currentTimeMillis();
if (r <= 0) return minimum; else return r;
}
public static void main(String[] args) {
//System.out.println("kelondroDate is (" + new kelondroDate().toString() + ")");
System.out.println("offset is " + (UTCDiff()/1000/60/60) + " hours, javaDate is " + new Date() + ", correctedDate is " + new Date(correctedUTCTime()));

Loading…
Cancel
Save