Merge branch 'master' of git@gitorious.org:yacy/rc1.git

pull/1/head
orbiter 11 years ago
commit 95780eed32

@@ -43,8 +43,10 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.data.WorkTables;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
@@ -218,7 +220,11 @@ public class Crawler_p {
if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
// delete old robots entries
for (DigestURL ru: rootURLs) sb.robots.delete(ru);
for (DigestURL ru: rootURLs) {
sb.robots.delete(ru);
try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
}
try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
// set the crawl filter
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);

@@ -639,7 +639,7 @@ public class CrawlQueues {
} else {
// starting a load from the internet
request.setStatus("worker-loading", WorkflowJob.STATUS_RUNNING);
String result = null;
String error = null;
// load a resource and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred
@@ -651,23 +651,29 @@
if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
}
result = "no content (possibly caused by cache policy)";
error = "no content (possibly caused by cache policy)";
} else {
request.setStatus("loaded", WorkflowJob.STATUS_RUNNING);
final String storedFailMessage = CrawlQueues.this.sb.toIndexer(response);
request.setStatus("enqueued-" + ((storedFailMessage == null) ? "ok" : "fail"), WorkflowJob.STATUS_FINISHED);
result = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
error = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
}
} catch (final IOException e) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
}
result = "load error - " + e.getMessage();
error = "load error - " + e.getMessage();
}
if (result != null) {
CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
if (error != null) {
if (error.endsWith("$")) {
// the "$" mark at the end of the error message means that the error was already pushed to the error-db by the reporting method
// thus we only push this message if we don't have that mark
error = error.substring(0, error.length() - 1).trim();
} else {
CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + error, -1);
}
request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
} else {
request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
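A minimal, self-contained sketch of the "$" convention introduced in this hunk, using hypothetical stand-in names (pushToErrorDb, handleLoadResult) rather than the actual YaCy classes: a thrower that has already pushed the failure to the error-db appends '$' to its message (see the HTTPLoader hunks below), and the worker strips the mark instead of reporting the failure twice.

public class ErrorMarkSketch {

    // stand-in for CrawlQueues.this.errorURL.push(...): records a failure in the error-db
    static void pushToErrorDb(final String reason) {
        System.out.println("error-db <- " + reason);
    }

    // mirrors the worker logic above
    static void handleLoadResult(String error) {
        if (error == null) return; // loaded fine
        if (error.endsWith("$")) {
            // the thrower already pushed this failure to the error-db; just strip the mark
            error = error.substring(0, error.length() - 1).trim();
        } else {
            pushToErrorDb("cannot load: " + error);
        }
        System.out.println("worker-error: " + error);
    }

    public static void main(final String[] args) {
        handleLoadResult("load error - connection refused");              // pushed to the error-db here
        handleLoadResult("retry counter exceeded. Processing aborted.$"); // already pushed by the thrower
    }
}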

@@ -78,7 +78,7 @@ public final class HTTPLoader {
if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
}
DigestURL url = request.url();
@@ -94,7 +94,7 @@
final String hostlow = host.toLowerCase();
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
// resolve yacy and yacyh domains
@@ -141,7 +141,7 @@
if (redirectionUrlString.isEmpty()) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMPTY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
throw new IOException("REJECTED EMPTY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
}
// normalize URL
@@ -161,7 +161,7 @@
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
}
// retry crawling with new url
@@ -170,11 +170,11 @@
}
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} else if (responseBody == null) {
// no response, reject file
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
@@ -185,7 +185,7 @@
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
}
// create a new cache entry
@@ -202,7 +202,7 @@
} else {
// if the response has not the right response type then reject file
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
}
}

@@ -155,13 +155,7 @@ public class RobotsTxt {
}
// generating the proper url to download the robots txt
DigestURL robotsURL = null;
try {
robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null;
}
DigestURL robotsURL = robotsURL(urlHostPort);
Response response = null;
if (robotsURL != null) {
@@ -230,14 +224,8 @@
if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
// generating the proper url to download the robots txt
DigestURL robotsURL = null;
try {
robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null;
}
DigestURL robotsURL = robotsURL(urlHostPort);
Response response = null;
if (robotsURL != null) {
if (log.isFine()) log.fine("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
@@ -332,7 +320,7 @@
}
}
static final String getHostPort(final MultiProtocolURL theURL) {
public static final String getHostPort(final MultiProtocolURL theURL) {
int port = theURL.getPort();
if (port == -1) {
if (theURL.getProtocol().equalsIgnoreCase("http")) {
@@ -349,7 +337,18 @@
sb.append(host).append(':').append(Integer.toString(port));
return sb.toString();
}
public static DigestURL robotsURL(final String urlHostPort) {
DigestURL robotsURL = null;
try {
robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null;
}
return robotsURL;
}
public static class CheckEntry {
public final DigestURL digestURL;
public final RobotsTxtEntry robotsTxtEntry;
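Both formerly duplicated construction blocks above now delegate to the robotsURL(String) helper introduced in the previous hunk. A hedged sketch of what that helper computes, with java.net.URL standing in for YaCy's DigestURL: the robots.txt location for a given host:port string, choosing https only for port 443.

import java.net.MalformedURLException;
import java.net.URL;

public class RobotsUrlSketch {

    // stand-in for RobotsTxt.robotsURL(String); the original returns a DigestURL
    // and logs via log.severe(...) before returning null on malformed input
    static URL robotsURL(final String urlHostPort) {
        try {
            return new URL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
        } catch (final MalformedURLException e) {
            return null;
        }
    }

    public static void main(final String[] args) {
        System.out.println(robotsURL("example.org:80"));  // http://example.org:80/robots.txt
        System.out.println(robotsURL("example.org:443")); // https://example.org:443/robots.txt
    }
}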

@@ -29,7 +29,6 @@
package net.yacy.document.parser.html;
import java.util.Properties;
import java.util.Set;
import net.yacy.kelondro.util.MemoryControl;
@@ -72,10 +71,10 @@ public abstract class AbstractScraper implements Scraper {
// the other methods must take into account to construct the return value correctly
@Override
public abstract void scrapeTag0(String tagname, Properties tagopts);
public abstract void scrapeTag0(ContentScraper.Tag tag);
@Override
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
public abstract void scrapeTag1(ContentScraper.Tag tag);
public static String stripAllTags(final char[] s) {
if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return "";

@@ -24,7 +24,6 @@
package net.yacy.document.parser.html;
import java.util.Properties;
import java.util.TreeSet;
public abstract class AbstractTransformer implements Transformer {
@@ -58,13 +57,13 @@ public abstract class AbstractTransformer implements Transformer {
// the other methods must take into account to construct the return value correctly
@Override
public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
return TransformerWriter.genTag0(tagname, tagopts, quotechar);
public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
}
@Override
public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
@Override

@@ -59,6 +59,7 @@ import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
@@ -80,7 +81,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
singleton, pair;
}
public enum Tag {
public enum TagName {
html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
div(TagType.singleton), // scraped as singleton to get attached properties like 'id'
@@ -111,14 +112,49 @@ public class ContentScraper extends AbstractScraper implements Scraper {
style(TagType.pair);
public TagType type;
private Tag(final TagType type) {
private TagName(final TagType type) {
this.type = type;
}
}
public static class Tag {
public String name;
public Properties opts;
public CharBuffer content;
public Tag(final String name) {
this.name = name;
this.opts = new Properties();
this.content = new CharBuffer(100);
}
public Tag(final String name, final Properties opts) {
this.name = name;
this.opts = opts;
this.content = new CharBuffer(100);
}
public Tag(final String name, final Properties opts, final CharBuffer content) {
this.name = name;
this.opts = opts;
this.content = content;
}
public void close() {
this.name = null;
this.opts = null;
if (this.content != null) this.content.close();
this.content = null;
}
@Override
public void finalize() {
this.close();
}
@Override
public String toString() {
return "<" + name + " " + opts + ">" + content + "</" + name + ">";
}
}
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static {
for (final Tag tag: Tag.values()) {
for (final TagName tag: TagName.values()) {
if (tag.type == TagType.singleton) linkTags0.add(tag.name());
if (tag.type == TagType.pair) linkTags1.add(tag.name());
}
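The renamed TagName enum keeps the tag catalogue, while the new Tag class bundles what previously travelled through the scraper interfaces as separate tagname/tagopts/text parameters. A small usage sketch, with java.util.Properties as in the original and StringBuilder standing in for YaCy's CharBuffer:

import java.util.Properties;

public class TagSketch {

    // simplified stand-in for ContentScraper.Tag: name + attributes + collected body text
    static class Tag {
        final String name;
        final Properties opts;
        final StringBuilder content = new StringBuilder(100);

        Tag(final String name, final Properties opts) {
            this.name = name;
            this.opts = opts;
        }

        @Override
        public String toString() {
            return "<" + name + " " + opts + ">" + content + "</" + name + ">";
        }
    }

    public static void main(final String[] args) {
        final Properties opts = new Properties();
        opts.setProperty("href", "http://example.org/");
        final Tag a = new Tag("a", opts);
        a.content.append("example link");
        // a scraper callback can now read name, attributes and body text from one object
        System.out.println(a);
    }
}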
@@ -321,88 +357,88 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
@Override
public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) {
final String src = tagopts.getProperty("src", EMPTY_STRING);
public void scrapeTag0(Tag tag) {
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final AnchorURL url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
final ImageEntry ie = new ImageEntry(url, tag.opts.getProperty("alt", EMPTY_STRING), width, height, -1);
this.images.add(ie);
}
}
} catch (final NumberFormatException e) {}
this.evaluationScores.match(Element.imgpath, src);
} else if(tagname.equalsIgnoreCase("base")) {
} else if(tag.name.equalsIgnoreCase("base")) {
try {
this.root = new DigestURL(tagopts.getProperty("href", EMPTY_STRING));
this.root = new DigestURL(tag.opts.getProperty("href", EMPTY_STRING));
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true));
src.setAll(tagopts);
} else if (tag.name.equalsIgnoreCase("frame")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
this.anchors.add(src);
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true));
} else if (tagname.equalsIgnoreCase("body")) {
final String c = tagopts.getProperty("class", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("body")) {
final String c = tag.opts.getProperty("class", EMPTY_STRING);
this.evaluationScores.match(Element.bodyclass, c);
} else if (tagname.equalsIgnoreCase("div")) {
final String id = tagopts.getProperty("id", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
} else if (tagname.equalsIgnoreCase("meta")) {
final String content = tagopts.getProperty("content", EMPTY_STRING);
String name = tagopts.getProperty("name", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("meta")) {
final String content = tag.opts.getProperty("content", EMPTY_STRING);
String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.toLowerCase().equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content);
}
}
name = tagopts.getProperty("http-equiv", EMPTY_STRING);
name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
name = tagopts.getProperty("property", EMPTY_STRING);
name = tag.opts.getProperty("property", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
} else if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING));
//String alt = tagopts.getProperty("alt",EMPTY_STRING);
final String href = tagopts.getProperty("href", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
//String alt = tag.opts.getProperty("alt",EMPTY_STRING);
final String href = tag.opts.getProperty("href", EMPTY_STRING);
if (href.length() > 0) {
tagopts.put("name", areatitle);
tag.opts.put("name", areatitle);
AnchorURL url = absolutePath(href);
tagopts.put("href", url.toNormalform(true));
url.setAll(tagopts);
tag.opts.put("href", url.toNormalform(true));
url.setAll(tag.opts);
this.anchors.add(url);
}
} else if (tagname.equalsIgnoreCase("link")) {
final String href = tagopts.getProperty("href", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("link")) {
final String href = tag.opts.getProperty("href", EMPTY_STRING);
final AnchorURL newLink = absolutePath(href);
if (newLink != null) {
tagopts.put("href", newLink.toNormalform(true));
String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", EMPTY_STRING);
final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
tag.opts.put("href", newLink.toNormalform(true));
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
final String linktitle = tag.opts.getProperty("title", EMPTY_STRING);
final String type = tag.opts.getProperty("type", EMPTY_STRING);
final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING);
if (rel.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
this.images.add(ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.setAll(tagopts);
tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.setAll(tag.opts);
this.anchors.add(newLink);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("publisher")) {
@@ -417,130 +453,130 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.css.put(newLink, rel);
this.evaluationScores.match(Element.csspath, href);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
tagopts.put("name", linktitle);
newLink.setAll(tagopts);
tag.opts.put("name", linktitle);
newLink.setAll(tag.opts);
this.anchors.add(newLink);
}
}
} else if(tagname.equalsIgnoreCase("embed")) {
final String src = tagopts.getProperty("src", EMPTY_STRING);
} else if(tag.name.equalsIgnoreCase("embed")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final AnchorURL url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
tagopts.put("src", url.toNormalform(true));
final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING));
final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
tag.opts.put("src", url.toNormalform(true));
final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
this.embeds.put(url, ie);
url.setAll(tagopts);
url.setAll(tag.opts);
this.anchors.add(url);
}
}
} catch (final NumberFormatException e) {}
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", EMPTY_STRING);
} else if(tag.name.equalsIgnoreCase("param")) {
final String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) {
AnchorURL url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
tagopts.put("value", url.toNormalform(true));
url.setAll(tagopts);
AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.anchors.add(url);
}
} else if (tagname.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true));
src.setAll(tagopts);
} else if (tag.name.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
this.anchors.add(src);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
} else if (tagname.equalsIgnoreCase("html")) {
final String lang = tagopts.getProperty("lang", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("html")) {
final String lang = tag.opts.getProperty("lang", EMPTY_STRING);
if (!lang.isEmpty()) // fake a language meta to preserve detection from <html lang="xx" />
this.metas.put("dc.language",lang.substring(0,2)); // fix found entries like "hu-hu"
}
// fire event
fireScrapeTag0(tagname, tagopts);
fireScrapeTag0(tag.name, tag.opts);
}
@Override
public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
String href = tagopts.getProperty("href", EMPTY_STRING);
public void scrapeTag1(Tag tag) {
// System.out.println("ScrapeTag1: tag.name=" + tag.name + ", opts=" + tag.opts.toString() + ", content=" + tag.content.toString());
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING);
href = CharacterCoding.html2unicode(href);
AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) {
if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1);
this.images.add(ie);
} else {
if (followDenied()) {
String rel = tagopts.getProperty("rel", EMPTY_STRING);
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tagopts.put("rel", rel);
tag.opts.put("rel", rel);
}
tagopts.put("text", new String(text));
tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tagopts);
recursiveParse(url, text);
tag.opts.put("text", new String(tag.content.getChars()));
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
}
}
this.evaluationScores.match(Element.apath, href);
}
final String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = recursiveParse(null, text);
if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[0].add(h);
} else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[1].add(h);
} else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("h3")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[2].add(h);
} else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("h4")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[3].add(h);
} else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("h5")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[4].add(h);
} else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("h6")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
String t = recursiveParse(null, text);
this.titles.add(t);
this.evaluationScores.match(Element.title, t);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("title")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
this.titles.add(h);
this.evaluationScores.match(Element.title, h);
} else if ((tag.name.equalsIgnoreCase("b")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.bold.inc(h);
} else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("strong")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.bold.inc(h);
} else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("i")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.italic.inc(h);
} else if ((tagname.equalsIgnoreCase("u")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("u")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.underline.inc(h);
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("script")) {
final String src = tagopts.getProperty("src", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("script")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) {
this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src);
} else {
this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(text)).replaceAll(" "));
this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" "));
}
}
// fire event
fireScrapeTag1(tagname, tagopts, text);
fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
}
@@ -570,15 +606,20 @@ public class ContentScraper extends AbstractScraper implements Scraper {
for (final AnchorURL entry: scraper.getAnchors()) {
this.anchors.add(entry);
}
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
for (ImageEntry ie: scraper.images) {
if (linkurl != null) {
ie.setLinkurl(linkurl);
ie.setAnchortext(new String(inlineHtml));
ie.setAnchortext(line);
}
// this image may have been added recently from the same location (as this is a recursive parse)
// we want to keep only one of them, check if they are equal
if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) {
this.images.remove(this.images.size() - 1);
}
this.images.add(ie);
}
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
scraper.close();
return line;
}
@@ -681,6 +722,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public String getText() {
this.content.trim();
try {
return this.content.toString();
} catch (final OutOfMemoryError e) {

@@ -29,7 +29,6 @@ import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Properties;
import java.util.TreeSet;
import net.yacy.cora.document.encoding.ASCII;
@@ -115,27 +114,27 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
@Override
public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
if (tagname.equals("img")) {
public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
if (tag.name.equals("img")) {
// check bluelist
if (bluelistHit(tagopts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
if (bluelistHit(tagopts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
if (bluelistHit(tag.opts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
if (bluelistHit(tag.opts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
// replace image alternative name
tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt", "").toCharArray())));
tag.opts.setProperty("alt", new String(transformText(tag.opts.getProperty("alt", "").toCharArray())));
}
if (tagname.equals("input") && (tagopts.getProperty("type") != null && tagopts.getProperty("type").equals("submit"))) {
if (tag.name.equals("input") && (tag.opts.getProperty("type") != null && tag.opts.getProperty("type").equals("submit"))) {
// rewrite button name
tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value", "").toCharArray())));
tag.opts.setProperty("value", new String(transformText(tag.opts.getProperty("value", "").toCharArray())));
}
return TransformerWriter.genTag0(tagname, tagopts, quotechar);
return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
}
@Override
public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
if (bluelistHit(tagopts.getProperty("href","").toCharArray())) return genBlueLetters(text.length);
if (bluelistHit(text)) return genBlueLetters(text.length);
return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
if (bluelistHit(tag.opts.getProperty("href","").toCharArray())) return genBlueLetters(tag.content.length());
if (bluelistHit(tag.content.getChars())) return genBlueLetters(tag.content.length());
return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
@Override

@@ -24,8 +24,6 @@
package net.yacy.document.parser.html;
import java.util.Properties;
public interface Scraper {
public boolean isTag0(String tag);
@@ -34,9 +32,9 @@ public interface Scraper {
public void scrapeText(char[] text, String insideTag);
public void scrapeTag0(String tagname, Properties tagopts);
public void scrapeTag0(ContentScraper.Tag tag);
public void scrapeTag1(String tagname, Properties tagopts, char[] text);
public void scrapeTag1(ContentScraper.Tag tag);
public void scrapeComment(final char[] comment);

@@ -24,8 +24,6 @@
package net.yacy.document.parser.html;
import java.util.Properties;
public interface Transformer {
// the init method is used to initialize the transformer with some values
@@ -52,10 +50,10 @@ public interface Transformer {
public char[] transformText(char[] text);
// method that is called when a body-less tag occurs
public char[] transformTag0(String tagname, Properties tagopts, char quotechar);
public char[] transformTag0(ContentScraper.Tag tag, char quotechar);
// method that is called when a body-containing text occurs
public char[] transformTag1(String tagname, Properties tagopts, char[] text, char quotechar);
public char[] transformTag1(ContentScraper.Tag tag, char quotechar);
public void close();
}

@@ -43,6 +43,7 @@ import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.Properties;
import java.util.Stack;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
@@ -62,9 +63,7 @@ public final class TransformerWriter extends Writer {
private final OutputStream outStream;
private OutputStreamWriter out;
private CharBuffer buffer;
private String filterTag;
private Properties filterOpts;
private CharBuffer filterCont;
private Stack<ContentScraper.Tag> tagStack;
private final Scraper scraper;
private final Transformer transformer;
private boolean inSingleQuote;
@@ -72,7 +71,7 @@ public final class TransformerWriter extends Writer {
private boolean inComment;
private boolean binaryUnsuspect;
private final boolean passbyIfBinarySuspect;
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
@@ -95,9 +94,7 @@ public final class TransformerWriter extends Writer {
this.scraper = scraper;
this.transformer = transformer;
this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize);
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
this.tagStack = new Stack<ContentScraper.Tag>();
this.inSingleQuote = false;
this.inDoubleQuote = false;
this.inComment = false;
@@ -186,63 +183,105 @@ public final class TransformerWriter extends Writer {
return result;
}
private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
//System.out.println("filterTag: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
// distinguish the following cases:
// - (1) not collecting data for a tag and getting no tag (not opener and not close)
// - (2) not collecting data for a tag and getting a tag opener
// - (3) not collecting data for a tag and getting a tag close
// - (4) collecting data for a tag and getting no tag (not opener and not close)
// - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
// - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
// - (7) collecting data for a tag and getting the correct close tag for that collecting tag
if (this.filterTag == null) {
/**
* the token processor distinguishes three different types of input: opening tag, closing tag, text content
* @param in - the token to be processed
* @param quotechar
* @return a processed version of the token
*/
private char[] tokenProcessor(final char[] in, final char quotechar) {
if (in.length == 0) return in;
// scan the string and parse structure
if (in.length <= 2 || in[0] != lb) return filterTag(in); // this is a text
// this is a tag
String tag;
int tagend;
if (in[1] == '/') {
// a closing tag
tagend = tagEnd(in, 2);
tag = new String(in, 2, tagend - 2).toLowerCase();
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(text, quotechar, tag, false);
}
// an opening tag
tagend = tagEnd(in, 1);
tag = new String(in, 1, tagend - 1).toLowerCase();
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(text, quotechar, tag, true);
}
// distinguish the following cases:
// - (1) not collecting data for a tag and getting no tag (not opener and not close)
// - (2) not collecting data for a tag and getting a tag opener
// - (3) not collecting data for a tag and getting a tag close
// - (4) collecting data for a tag and getting no tag (not opener and not close)
// - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
// - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
// - (7) collecting data for a tag and getting the correct close tag for that collecting tag
/**
 * handle text content: either hand it to the scraper/transformer directly (case 1, no tag open)
 * or collect it for the tag on top of the stack (case 4)
 * @param content the text content to process
 * @return the filtered content; an empty array if the content was collected on the tag stack
 */
private char[] filterTag(final char[] content) {
if (this.tagStack.size() == 0) {
// we are not collecting tag text -> case (1) - (3)
// case (1): this is not a tag opener/closer
if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
if (this.transformer != null) return this.transformer.transformText(content);
return content;
}
if (tag == null) {
// case (1): this is not a tag opener/closer
if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
if (this.transformer != null) return this.transformer.transformText(content);
return content;
}
// we are collecting tag text for the tag on top of the stack -> case (4) - (7)
// case (4): getting no tag, go on collecting content
if (this.scraper != null) {
this.scraper.scrapeText(content, this.tagStack.lastElement().name);
}
if (this.transformer != null) {
this.tagStack.lastElement().content.append(this.transformer.transformText(content));
} else {
this.tagStack.lastElement().content.append(content);
}
return new char[0];
}
private char[] filterTag(final char[] content, final char quotechar, final String tagname, final boolean opening) {
assert tagname != null;
if (this.tagStack.size() == 0) {
// we are not collecting tag text -> case (1) - (3)
// we have a new tag
if (opening) {
// case (2):
return filterTagOpening(tag, content, quotechar);
return filterTagOpening(tagname, content, quotechar);
}
// it's a close tag
// it's a close tag where none should be
// case (3): we ignore that thing and return it again
return genTag0raw(tag, false, content);
return genTag0raw(tagname, false, content);
}
// we are collecting tag text for the tag on top of the stack -> case (4) - (7)
if (tag == null || tag.equals("!")) {
// case (4): getting no tag, go on collecting content
if (this.scraper != null) {
this.scraper.scrapeText(content, this.filterTag);
}
if (this.transformer != null) {
this.filterCont.append(this.transformer.transformText(content));
} else {
this.filterCont.append(content);
}
return new char[0];
}
if (tagname.equals("!")) return filterTag(content); // a '!' pseudo-tag (comment/doctype) is collected as text content
// it's a tag! which one?
if (opening) {
// case (5): the opening should not be here. But we keep the order anyway
this.filterCont.append(filterTagOpening(tag, content, quotechar));
return filterTagCloseing(quotechar);
this.tagStack.lastElement().content.append(filterTagOpening(tagname, content, quotechar));
return new char[0];
}
if (!tag.equalsIgnoreCase(this.filterTag)) {
if (!tagname.equalsIgnoreCase(this.tagStack.lastElement().name)) {
// case (6): its a closing tag, but the wrong one. just add it.
this.filterCont.append(genTag0raw(tag, opening, content));
this.tagStack.lastElement().content.append(genTag0raw(tagname, opening, content));
return new char[0];
}
@@ -250,101 +289,66 @@ public final class TransformerWriter extends Writer {
return filterTagCloseing(quotechar);
}
private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) {
if (this.scraper != null && this.scraper.isTag0(tag)) {
private char[] filterTagOpening(final String tagname, final char[] content, final char quotechar) {
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
charBuffer.close();
if (this.scraper != null && this.scraper.isTag0(tagname)) {
// this single tag is collected at once here
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
charBuffer.close();
this.scraper.scrapeTag0(tag);
}
if (this.transformer != null && this.transformer.isTag0(tag)) {
if (this.transformer != null && this.transformer.isTag0(tagname)) {
// this single tag is collected at once here
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
char[] b = new char[0];
try {
b = this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
scb.close();
}
b = this.transformer.transformTag0(tag, quotechar);
return b;
} else if ((this.scraper != null && this.scraper.isTag1(tag)) ||
(this.transformer != null && this.transformer.isTag1(tag))) {
// ok, start collecting
this.filterTag = tag;
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.filterOpts = scb.propParser();
scb.close();
if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
} else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
(this.transformer != null && this.transformer.isTag1(tagname))) {
// ok, start collecting; we don't push this here to the scraper or transformer; we do that when the tag is closed.
this.tagStack.push(tag);
return new char[0];
} else {
// we ignore that thing and return it again
return genTag0raw(tag, true, content);
return genTag0raw(tagname, true, content);
}
}
private char[] filterTagCloseing(final char quotechar) {
char[] ret;
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
ContentScraper.Tag tag = this.tagStack.lastElement();
if (this.scraper != null) this.scraper.scrapeTag1(tag);
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
ret = this.transformer.transformTag1(tag, quotechar);
} else {
ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
if ((this.scraper != null && this.scraper.isTag1(tag.name)) ||
(this.transformer != null && this.transformer.isTag1(tag.name))) {
// remove the tag from the stack as soon as the tag is processed
this.tagStack.pop();
// at this point the characters from the recently processed tag must be attached to the previous tag
if (this.tagStack.size() > 0) this.tagStack.lastElement().content.append(ret);
}
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
return ret;
}
private char[] filterFinalize(final char quotechar) {
if (this.filterTag == null) {
if (this.tagStack.size() == 0) {
return new char[0];
}
// it's our closing tag! return complete result.
char[] ret;
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
if (this.scraper != null) this.scraper.scrapeTag1(this.tagStack.lastElement());
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
ret = this.transformer.transformTag1(this.tagStack.lastElement(), quotechar);
} else {
ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar);
}
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
this.tagStack.pop();
return ret;
}
private char[] filterSentence(final char[] in, final char quotechar) {
if (in.length == 0) return in;
//System.out.println("filterSentence, quotechar = \"" + quotechar + "\": " + new String(in)); // debug
// scan the string and parse structure
if (in.length > 2 && in[0] == lb) {
// a tag
String tag;
int tagend;
if (in[1] == '/') {
// a closing tag
tagend = tagEnd(in, 2);
tag = new String(in, 2, tagend - 2).toLowerCase();
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(tag, false, text, quotechar);
}
// an opening tag
tagend = tagEnd(in, 1);
tag = new String(in, 1, tagend - 1).toLowerCase();
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(tag, true, text, quotechar);
}
// a text
return filterTag(null, true, in, quotechar);
}
private static int tagEnd(final char[] tag, final int start) {
char c;
for (int i = start; i < tag.length; i++) {
@@ -358,6 +362,14 @@ public final class TransformerWriter extends Writer {
return tag.length - 1;
}
/**
* this is the tokenizer of the parser: it splits the input into pieces which are
* - quoted text parts
* - commented text parts
* - tags (opening and closing)
* - text content between all these parts
* The tokens are then parsed with the tokenProcessor method
*/
@Override
public void write(final int c) throws IOException {
//System.out.println((char) c);
@@ -375,7 +387,7 @@ public final class TransformerWriter extends Writer {
if ((c == rb) && (this.buffer.length() > 0 && this.buffer.charAt(0) == lb)) {
this.inSingleQuote = false;
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), singlequote);
filtered = tokenProcessor(this.buffer.getChars(), singlequote);
if (this.out != null) { this.out.write(filtered); }
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -387,7 +399,7 @@ public final class TransformerWriter extends Writer {
if (c == rb && this.buffer.length() > 0 && this.buffer.charAt(0) == lb) {
this.inDoubleQuote = false;
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), doublequote);
filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -425,7 +437,7 @@ public final class TransformerWriter extends Writer {
} else if (c == rb) {
this.buffer.append(c);
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), doublequote);
filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -433,7 +445,7 @@ public final class TransformerWriter extends Writer {
// this is an error case
// we consider that there is one rb missing
if (this.buffer.length() > 0) {
filtered = filterSentence(this.buffer.getChars(), doublequote);
filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// this.buffer = new serverByteBuffer();
@@ -447,7 +459,7 @@ public final class TransformerWriter extends Writer {
if (c == lb) {
// the text ends here
if (this.buffer.length() > 0) {
filtered = filterSentence(this.buffer.getChars(), doublequote);
filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// this.buffer = new serverByteBuffer();
@@ -492,7 +504,7 @@ public final class TransformerWriter extends Writer {
final char quotechar = (this.inSingleQuote) ? singlequote : doublequote;
if (this.buffer != null) {
if (this.buffer.length() > 0) {
final char[] filtered = filterSentence(this.buffer.getChars(), quotechar);
final char[] filtered = tokenProcessor(this.buffer.getChars(), quotechar);
if (this.out != null) this.out.write(filtered);
}
this.buffer.close();
@@ -504,10 +516,8 @@ public final class TransformerWriter extends Writer {
this.out.flush();
this.out.close();
}
this.filterTag = null;
this.filterOpts = null;
if (this.filterCont != null) this.filterCont.close();
this.filterCont = null;
this.tagStack.clear();
this.tagStack = null;
if (this.scraper != null) this.scraper.finish();
}
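Replacing the filterTag/filterOpts/filterCont triple with a Stack<ContentScraper.Tag> is what lets nested pair tags be collected correctly: when an inner tag closes, its rendered output is appended to the content of the tag below it on the stack. A compressed, self-contained sketch of that flow with hypothetical names and without the scraper/transformer callbacks:

import java.util.Stack;

public class TagStackSketch {

    static class Tag {
        final String name;
        final StringBuilder content = new StringBuilder();
        Tag(final String name) { this.name = name; }
    }

    static final Stack<Tag> tagStack = new Stack<Tag>();
    static final StringBuilder out = new StringBuilder();

    static void openTag(final String name) { tagStack.push(new Tag(name)); }

    static void text(final String t) {
        if (tagStack.isEmpty()) out.append(t);  // case (1): no tag open
        else tagStack.peek().content.append(t); // case (4): collect for the innermost open tag
    }

    static void closeTag() {
        final Tag tag = tagStack.pop();         // case (7): the matching close tag
        final String rendered = "<" + tag.name + ">" + tag.content + "</" + tag.name + ">";
        // attach the rendered inner tag to the enclosing tag, as filterTagCloseing does above
        if (tagStack.isEmpty()) out.append(rendered);
        else tagStack.peek().content.append(rendered);
    }

    public static void main(final String[] args) {
        openTag("b"); text("bold "); openTag("i"); text("and italic"); closeTag(); closeTag();
        System.out.println(out); // <b>bold <i>and italic</i></b>
    }
}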

@@ -28,16 +28,18 @@ import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.LinkedHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@@ -53,9 +55,7 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
private final int maxLinks = 10000;
private Charset detectedcharset;
private static final int maxLinks = 10000;
public htmlParser() {
super("Streaming HTML Parser");
@@ -97,9 +97,10 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
Charset[] detectedcharsetcontainer = new Charset[]{null};
final ContentScraper scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper);
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
return new Document[]{document};
} catch (final IOException e) {
@@ -155,9 +156,27 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public ContentScraper parseToScraper(
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, String input, int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
sourceStream = new ByteArrayInputStream(documentCharset == null ? UTF8.getBytes(input) : input.getBytes(documentCharset));
} catch (UnsupportedEncodingException e) {
sourceStream = new ByteArrayInputStream(UTF8.getBytes(input));
}
ContentScraper scraper;
try {
scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
return scraper;
}
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
Charset[] detectedcharsetcontainer,
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {
@@ -171,13 +190,15 @@ public class htmlParser extends AbstractParser implements Parser {
// nothing found: try to find a meta-tag
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
htmlFilter.close();
} catch (final IOException e1) {
throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
} finally {
if (htmlFilter != null) htmlFilter.close();
}
}
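In the hunk below, the former instance field detectedcharset becomes a one-element Charset[] out-parameter, because parseToScraper is static now and can no longer store the detection result on the parser instance. A minimal sketch of the idiom with hypothetical names:

import java.nio.charset.Charset;

public class OutParamSketch {

    // a static method cannot write to an instance field, so the one-element array
    // carries the detected charset back to the caller alongside the return value
    static String parse(final String declaredCharset, final Charset[] detectedcharsetcontainer) {
        try {
            detectedcharsetcontainer[0] = Charset.forName(declaredCharset);
        } catch (final Exception e) { // IllegalCharsetNameException, UnsupportedCharsetException
            detectedcharsetcontainer[0] = Charset.defaultCharset();
        }
        return "document parsed as " + detectedcharsetcontainer[0].name();
    }

    public static void main(final String[] args) {
        final Charset[] container = new Charset[] { null };
        System.out.println(parse("utf-8", container));
        System.out.println(container[0]); // the detected charset is visible to the caller
    }
}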
@@ -193,21 +214,22 @@ public class htmlParser extends AbstractParser implements Parser {
// wtf? still nothing, just take system-standard
if (charset == null) {
detectedcharset = Charset.defaultCharset();
detectedcharsetcontainer[0] = Charset.defaultCharset();
} else {
try {
detectedcharset = Charset.forName(charset);
detectedcharsetcontainer[0] = Charset.forName(charset);
} catch (final IllegalCharsetNameException e) {
detectedcharset = Charset.defaultCharset();
detectedcharsetcontainer[0] = Charset.defaultCharset();
} catch (final UnsupportedCharsetException e) {
detectedcharset = Charset.defaultCharset();
detectedcharsetcontainer[0] = Charset.defaultCharset();
}
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
FileUtils.copy(sourceStream, writer, detectedcharset);
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
@@ -250,7 +272,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
// fix wrong fill characters
encoding = patternUnderline.matcher(encoding).replaceAll("-");
encoding = CommonPattern.UNDERSCORE.matcher(encoding).replaceAll("-");
if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312";
if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
@@ -306,10 +328,9 @@ public class htmlParser extends AbstractParser implements Parser {
try {
url = new AnchorURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new ByteArrayInputStream(content));
final String title = document[0].dc_title();
System.out.println(title);
System.out.println(CharacterCoding.unicode2html(title, false));
} catch (final MalformedURLException e) {
e.printStackTrace();
} catch (final IOException e) {
@@ -319,6 +340,7 @@ public class htmlParser extends AbstractParser implements Parser {
} catch (final InterruptedException e) {
e.printStackTrace();
}
System.exit(0);
}
}
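The new static String overload of parseToScraper makes it possible to scrape an in-memory HTML fragment; WebgraphConfiguration uses exactly that below to clean anchor texts. A hedged usage sketch, assuming the YaCy classes are on the classpath (the DigestURL only serves as the base for resolving relative links):

import java.io.IOException;
import java.net.MalformedURLException;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.htmlParser;

public class ParseToScraperSketch {
    public static void main(final String[] args) throws MalformedURLException, IOException {
        final DigestURL base = new DigestURL("http://example.org/");
        // a null charset means the fragment is read as UTF-8, see the overload above
        final ContentScraper scraper = htmlParser.parseToScraper(
                base, null, "<a href='x.html'><img src='y.png' alt='an image'>linked</a>", 10);
        System.out.println(scraper.getText());   // the tag-stripped text
        System.out.println(scraper.getImages()); // the image entries found in the fragment
    }
}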

@@ -81,6 +81,8 @@ public class genericImageParser extends AbstractParser implements Parser {
SUPPORTED_EXTENSIONS.add("jpeg");
SUPPORTED_EXTENSIONS.add("jpe");
SUPPORTED_EXTENSIONS.add("bmp");
SUPPORTED_EXTENSIONS.add("tif");
SUPPORTED_EXTENSIONS.add("tiff");
SUPPORTED_MIME_TYPES.add("image/png");
SUPPORTED_MIME_TYPES.add("image/gif");
SUPPORTED_MIME_TYPES.add("image/jpeg");
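Registering tif/tiff here backs the ContentScraper change above, where the hard-coded extension check for image anchors was replaced by genericImageParser.SUPPORTED_EXTENSIONS.contains(ext). A trivial sketch of that set-membership test, with the (partial) extension list mirrored locally:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class ExtensionCheckSketch {

    // mirrors genericImageParser.SUPPORTED_EXTENSIONS after this hunk (subset shown)
    static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>(
            Arrays.asList("png", "gif", "jpg", "jpeg", "jpe", "bmp", "tif", "tiff"));

    public static void main(final String[] args) {
        // the scraper now asks one shared set which anchors point at images
        for (final String ext : new String[] { "tiff", "html" }) {
            System.out.println(ext + " -> " + SUPPORTED_EXTENSIONS.contains(ext));
        }
    }
}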

@@ -163,12 +163,13 @@ public final class LoaderDispatcher {
check = this.loaderSteering.remove(request.url());
if (check != null) check.release(1000);
return response;
} catch (final IOException e) {
} catch (final IOException e) {
throw new IOException(e);
} finally {
// release the semaphore anyway
check = this.loaderSteering.remove(request.url());
if (check != null) check.release(1000);
// Very noisy: ConcurrentLog.logException(e);
throw new IOException(e);
if (check != null) check.release(1000);
// Very noisy: ConcurrentLog.logException(e);
}
}
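The reshuffled hunk above moves the semaphore release into the finally block, so it runs exactly once on both the success path and the IOException path instead of being duplicated before the rethrow. A minimal illustration of the pattern with java.util.concurrent.Semaphore:

import java.io.IOException;
import java.util.concurrent.Semaphore;

public class ReleaseInFinallySketch {

    static String load(final Semaphore gate, final boolean fail) throws IOException, InterruptedException {
        gate.acquire();
        try {
            if (fail) throw new IOException("simulated load failure");
            return "response";
        } finally {
            gate.release(); // runs on both paths, so the permit can never leak
        }
    }

    public static void main(final String[] args) throws Exception {
        final Semaphore gate = new Semaphore(1);
        System.out.println(load(gate, false));
        try {
            load(gate, true);
        } catch (final IOException e) {
            System.out.println("caught: " + e.getMessage());
        }
        System.out.println("permits left: " + gate.availablePermits()); // 1
    }
}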
@@ -190,7 +191,7 @@ public final class LoaderDispatcher {
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
// check if we have the page in the cache
@@ -244,13 +245,13 @@ public final class LoaderDispatcher {
}
}
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
// check case where we want results from the cache exclusively, and never from the Internet (offline mode)
if (cacheStrategy == CacheStrategy.CACHEONLY) {
// we had a chance to get the content from the cache .. its over. We don't have it.
throw new IOException("cache only strategy");
}
// now forget about the cache, nothing there. Try to load the content from the internet
// now forget about the cache, nothing there. Try to load the content from the Internet
// check access time: this is a double-check (we checked possibly already in the balancer)
// to make sure that we don't DoS the target by mistake
@@ -302,7 +303,7 @@ public final class LoaderDispatcher {
// no caching wanted. That's ok, do not write any message
return response;
}
// second check tells us if the protocoll tells us something about caching
// second check tells us if the protocol tells us something about caching
final String storeError = response.shallStoreCacheForCrawler();
if (storeError == null) {
try {

@@ -114,8 +114,14 @@ public class ErrorCache {
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
// send the error to solr
try {
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
this.fulltext.getDefaultConnector().add(errorDoc);
// do not overwrite error reports with error reports
SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName());
if (olddoc == null ||
olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null ||
((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) {
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
this.fulltext.getDefaultConnector().add(errorDoc);
}
} catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
}
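The added guard stores a fail document only when Solr has no document for the URL yet, when the stored document carries no HTTP status, or when the stored status is 200; in other words, only success records may be replaced by error records. A self-contained sketch of that decision, with a plain map standing in for the Solr connector:

import java.util.HashMap;
import java.util.Map;

public class ErrorOverwriteSketch {

    // stand-in for the Solr index: url hash -> stored httpstatus_i field (value may be null)
    static final Map<String, Integer> index = new HashMap<String, Integer>();

    static boolean shouldStoreError(final String urlHash) {
        if (!index.containsKey(urlHash)) return true;  // nothing stored yet
        final Integer oldStatus = index.get(urlHash);
        return oldStatus == null || oldStatus == 200;  // only success records may be overwritten
    }

    public static void main(final String[] args) {
        index.put("a", 200); // a previously successful load
        index.put("b", 404); // a previously recorded error
        System.out.println(shouldStoreError("a")); // true  - a success may become an error
        System.out.println(shouldStoreError("b")); // false - do not overwrite an error with an error
        System.out.println(shouldStoreError("c")); // true  - unknown document
    }
}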

@@ -176,7 +176,7 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
remaining--;
}
}
if (nodes.size() == 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
if (nodes.size() == 0 && this.edges.size() > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
// recursively step into depth and find next level
int depth = 1;

@@ -51,6 +51,8 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@@ -219,26 +221,31 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
// parse the anchor text to find embedded images and to get the cleaned plain text
ContentScraper textContent = null;
try {textContent = htmlParser.parseToScraper(source_url, null, text, 10);} catch (IOException e) {}
String extractedText = textContent == null ? text : textContent.getText(); // fall back to the raw text if parsing failed
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0);
ImageEntry ientry = null;
for (ImageEntry ie: images) {
if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30);
if (textContent != null) for (ImageEntry ie: textContent.getImages()) {
if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' ');
}
String alttext = ientry == null ? "" : ientry.alt();
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1);
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString());
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
// add the target attributes
// add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
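Instead of matching a single ImageEntry against the target URL, the new code scrapes the anchor text itself and concatenates the alt texts of all images found in it, trimming trailing blanks afterwards. A simplified sketch of that aggregation, with plain strings in place of ImageEntry:

import java.util.Arrays;
import java.util.List;

public class AltTextSketch {

    public static void main(final String[] args) {
        // stand-ins for textContent.getImages(): only the alt attribute matters here
        final List<String> altTexts = Arrays.asList("first image", "", "second image");

        final StringBuilder alttext = new StringBuilder(altTexts.size() * 30);
        for (final String alt : altTexts) {
            if (alt.length() > 0) alttext.append(alt).append(' ');
        }
        // trim the trailing blank exactly as the new code above does
        while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') {
            alttext.setLength(alttext.length() - 1);
        }
        System.out.println("'" + alttext + "'"); // 'first image second image'
    }
}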
