From 67beef657f82e92f48dd8425073ad81896a2ff4b Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Thu, 10 Apr 2014 18:58:03 +0200
Subject: [PATCH] strong redesign of html parser: object recursion is now done
 using a stack of html tag objects instead of a recursive parse-again method,
 which may cause bad performance and huge memory allocation. The new method
 also produces better parsed image objects with exact anchor text references.

---
 .../document/parser/html/AbstractScraper.java |   5 +-
 .../parser/html/AbstractTransformer.java      |   9 +-
 .../document/parser/html/ContentScraper.java  | 254 ++++++++++--------
 .../parser/html/ContentTransformer.java       |  25 +-
 .../yacy/document/parser/html/Scraper.java    |   6 +-
 .../document/parser/html/Transformer.java     |   6 +-
 .../parser/html/TransformerWriter.java        | 246 +++++++++--------
 .../net/yacy/document/parser/htmlParser.java  |  56 ++--
 .../parser/images/genericImageParser.java     |   2 +
 .../yacy/search/schema/HyperlinkGraph.java    |   2 +-
 .../search/schema/WebgraphConfiguration.java  |  25 +-
 11 files changed, 356 insertions(+), 280 deletions(-)

diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java
index d3bc2ffeb..e0980c21b 100644
--- a/source/net/yacy/document/parser/html/AbstractScraper.java
+++ b/source/net/yacy/document/parser/html/AbstractScraper.java
@@ -29,7 +29,6 @@
 
 package net.yacy.document.parser.html;
 
-import java.util.Properties;
 import java.util.Set;
 
 import net.yacy.kelondro.util.MemoryControl;
@@ -72,10 +71,10 @@ public abstract class AbstractScraper implements Scraper {
 
     // the other methods must take into account to construct the return value correctly
     @Override
-    public abstract void scrapeTag0(String tagname, Properties tagopts);
+    public abstract void scrapeTag0(ContentScraper.Tag tag);
 
     @Override
-    public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
+    public abstract void scrapeTag1(ContentScraper.Tag tag);
 
     public static String stripAllTags(final char[] s) {
         if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return "";
diff --git a/source/net/yacy/document/parser/html/AbstractTransformer.java b/source/net/yacy/document/parser/html/AbstractTransformer.java
index d812606c9..370d277e6 100644
--- a/source/net/yacy/document/parser/html/AbstractTransformer.java
+++ b/source/net/yacy/document/parser/html/AbstractTransformer.java
@@ -24,7 +24,6 @@
 
 package net.yacy.document.parser.html;
 
-import java.util.Properties;
 import java.util.TreeSet;
 
 public abstract class AbstractTransformer implements Transformer {
@@ -58,13 +57,13 @@ public abstract class AbstractTransformer implements Transformer {
 
     // the other methods must take into account to construct the return value correctly
     @Override
-    public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
-        return TransformerWriter.genTag0(tagname, tagopts, quotechar);
+    public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
+        return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
     }
 
     @Override
-    public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
-        return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
+    public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
+        return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
     }
 
     @Override
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 285cf26a1..0b4770c6e 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -59,6 +59,7 @@ import net.yacy.cora.util.NumberTools;
 import net.yacy.document.SentenceReader;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.html.Evaluation.Element;
+import net.yacy.document.parser.images.genericImageParser;
 import net.yacy.kelondro.io.CharBuffer;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.ISO639;
@@ -80,7 +81,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         singleton, pair;
     }
 
-    public enum Tag {
+    public enum TagName {
         html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
         body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
         div(TagType.singleton),  // scraped as singleton to get attached properties like 'id'
@@ -111,14 +112,49 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         style(TagType.pair);
 
         public TagType type;
-        private Tag(final TagType type) {
+        private TagName(final TagType type) {
             this.type = type;
         }
     }
 
+    public static class Tag {
+        public String name;
+        public Properties opts;
+        public CharBuffer content;
+        public Tag(final String name) {
+            this.name = name;
+            this.opts = new Properties();
+            this.content = new CharBuffer(100);
+        }
+        public Tag(final String name, final Properties opts) {
+            this.name = name;
+            this.opts = opts;
+            this.content = new CharBuffer(100);
+        }
+        public Tag(final String name, final Properties opts, final CharBuffer content) {
+            this.name = name;
+            this.opts = opts;
+            this.content = content;
+        }
+        public void close() {
+            this.name = null;
+            this.opts = null;
+            if (this.content != null) this.content.close();
+            this.content = null;
+        }
+        @Override
+        public void finalize() {
+            this.close();
+        }
+        @Override
+        public String toString() {
+            return "<" + name + " " + opts + ">" + content + "</" + name + ">";
+        }
+    }
+
     // all these tags must be given in lowercase, because the tags from the files are compared in lowercase
     static {
-        for (final Tag tag: Tag.values()) {
+        for (final TagName tag: TagName.values()) {
             if (tag.type == TagType.singleton) linkTags0.add(tag.name());
             if (tag.type == TagType.pair) linkTags1.add(tag.name());
         }
@@ -321,88 +357,88 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
 
     @Override
-    public void scrapeTag0(final String tagname, final Properties tagopts) {
-        if (tagname.equalsIgnoreCase("img")) {
-            final String src = tagopts.getProperty("src", EMPTY_STRING);
+    public void scrapeTag0(Tag tag) {
+        if (tag.name.equalsIgnoreCase("img")) {
+            final String src = tag.opts.getProperty("src", EMPTY_STRING);
             try {
                 if (src.length() > 0) {
                     final AnchorURL url = absolutePath(src);
                     if (url != null) {
-                        final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
-                        final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
-                        final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
+                        final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
+                        final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
+                        final ImageEntry ie = new ImageEntry(url, tag.opts.getProperty("alt", EMPTY_STRING), width, height, -1);
                         this.images.add(ie);
                     }
                 }
             } catch (final NumberFormatException e) {}
             this.evaluationScores.match(Element.imgpath, src);
-        } else if(tagname.equalsIgnoreCase("base")) {
+        } else if(tag.name.equalsIgnoreCase("base")) {
             try {
-                this.root = new DigestURL(tagopts.getProperty("href", EMPTY_STRING));
+                this.root = new DigestURL(tag.opts.getProperty("href", EMPTY_STRING));
             } catch (final MalformedURLException e) {}
-        } else if (tagname.equalsIgnoreCase("frame")) {
-            final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
-            tagopts.put("src", src.toNormalform(true));
-            src.setAll(tagopts);
+        } else if (tag.name.equalsIgnoreCase("frame")) {
+            final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
+            tag.opts.put("src", src.toNormalform(true));
+            src.setAll(tag.opts);
             this.anchors.add(src);
             this.frames.add(src);
             this.evaluationScores.match(Element.framepath, src.toNormalform(true));
-        } else if (tagname.equalsIgnoreCase("body")) {
-            final String c = tagopts.getProperty("class", EMPTY_STRING);
+        } else if (tag.name.equalsIgnoreCase("body")) {
+            final String c = tag.opts.getProperty("class", EMPTY_STRING);
             this.evaluationScores.match(Element.bodyclass, c);
-        } else if (tagname.equalsIgnoreCase("div")) {
-            final String id = tagopts.getProperty("id", EMPTY_STRING);
+        } else if (tag.name.equalsIgnoreCase("div")) {
+            final String id = tag.opts.getProperty("id", EMPTY_STRING);
             this.evaluationScores.match(Element.divid, id);
-            final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING);
+            final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
             if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
                 breadcrumbs++;
             }
-        } else if (tagname.equalsIgnoreCase("meta")) {
-            final String content = tagopts.getProperty("content", EMPTY_STRING);
-            String name = tagopts.getProperty("name", EMPTY_STRING);
+        } else if (tag.name.equalsIgnoreCase("meta")) {
+            final String content = tag.opts.getProperty("content", EMPTY_STRING);
+            String name = tag.opts.getProperty("name", EMPTY_STRING);
             if (name.length() > 0) {
                 this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
                 if (name.toLowerCase().equals("generator")) {
                     this.evaluationScores.match(Element.metagenerator, content);
                 }
             }
-            name = tagopts.getProperty("http-equiv", EMPTY_STRING);
+            name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
             if (name.length() > 0) {
                 this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
             }
-            name = tagopts.getProperty("property", EMPTY_STRING);
+            name = tag.opts.getProperty("property", EMPTY_STRING);
             if (name.length() > 0) {
                 this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
             }
-        } else if (tagname.equalsIgnoreCase("area")) {
-            final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING));
-            //String alt   = tagopts.getProperty("alt",EMPTY_STRING);
-            final String href  = tagopts.getProperty("href", EMPTY_STRING);
+        } else if (tag.name.equalsIgnoreCase("area")) {
+            final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
+            //String alt   = tag.opts.getProperty("alt",EMPTY_STRING);
+            final String href  = tag.opts.getProperty("href", EMPTY_STRING);
             if (href.length() > 0) {
-                tagopts.put("name", areatitle);
+                tag.opts.put("name", areatitle);
                 AnchorURL url = absolutePath(href);
-                tagopts.put("href", url.toNormalform(true));
-                url.setAll(tagopts);
+                tag.opts.put("href", url.toNormalform(true));
+                url.setAll(tag.opts);
                 this.anchors.add(url);
             }
-        } else if (tagname.equalsIgnoreCase("link")) {
-            final String href = tagopts.getProperty("href", EMPTY_STRING);
+        } else if (tag.name.equalsIgnoreCase("link")) {
+            final String href = tag.opts.getProperty("href", EMPTY_STRING);
             final AnchorURL newLink = absolutePath(href);
 
             if (newLink != null) {
-                tagopts.put("href", newLink.toNormalform(true));
-                String rel = tagopts.getProperty("rel", EMPTY_STRING);
-                final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
-                final String type = tagopts.getProperty("type", EMPTY_STRING);
-                final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
+                tag.opts.put("href", newLink.toNormalform(true));
+                String rel = tag.opts.getProperty("rel", EMPTY_STRING);
+                final String linktitle = tag.opts.getProperty("title", EMPTY_STRING);
+                final String type = tag.opts.getProperty("type", EMPTY_STRING);
+                final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING);
 
                 if (rel.equalsIgnoreCase("shortcut icon")) {
                     final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
                     this.images.add(ie);
                     this.favicon = newLink;
                 } else if (rel.equalsIgnoreCase("canonical")) {
-                    tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
-                    newLink.setAll(tagopts);
+                    tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
+                    newLink.setAll(tag.opts);
                     this.anchors.add(newLink);
                     this.canonical = newLink;
                 } else if (rel.equalsIgnoreCase("publisher")) {
@@ -417,130 +453,130 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                     this.css.put(newLink, rel);
                     this.evaluationScores.match(Element.csspath, href);
                 } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
-                    tagopts.put("name", linktitle);
-                    newLink.setAll(tagopts);
+                    tag.opts.put("name", linktitle);
+                    newLink.setAll(tag.opts);
                     this.anchors.add(newLink);
                 }
             }
-        } else if(tagname.equalsIgnoreCase("embed")) {
-            final String src = tagopts.getProperty("src", EMPTY_STRING);
+        } else if(tag.name.equalsIgnoreCase("embed")) {
+            final String src = tag.opts.getProperty("src", EMPTY_STRING);
             try {
                 if (src.length() > 0) {
                     final AnchorURL url = absolutePath(src);
                     if (url != null) {
-                        final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
-                        final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
-                        tagopts.put("src", url.toNormalform(true));
-                        final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING));
+                        final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
+                        final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
+                        tag.opts.put("src", url.toNormalform(true));
+                        final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
                         this.embeds.put(url, ie);
-                        url.setAll(tagopts);
+                        url.setAll(tag.opts);
                         this.anchors.add(url);
                     }
                 }
             } catch (final NumberFormatException e) {}
-        } else if(tagname.equalsIgnoreCase("param")) {
-            final String name = tagopts.getProperty("name", EMPTY_STRING);
+        } else if(tag.name.equalsIgnoreCase("param")) {
+            final String name = tag.opts.getProperty("name", EMPTY_STRING);
             if (name.equalsIgnoreCase("movie")) {
-                AnchorURL url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
-                tagopts.put("value", url.toNormalform(true));
-                url.setAll(tagopts);
+                AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
+                tag.opts.put("value", url.toNormalform(true));
+                url.setAll(tag.opts);
                 this.anchors.add(url);
             }
-        } else if (tagname.equalsIgnoreCase("iframe")) {
-            final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
-            tagopts.put("src", src.toNormalform(true));
-            src.setAll(tagopts);
+        } else if (tag.name.equalsIgnoreCase("iframe")) {
+            final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
+            tag.opts.put("src", src.toNormalform(true));
+            src.setAll(tag.opts);
             this.anchors.add(src);
             this.iframes.add(src);
             this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
-        } else if (tagname.equalsIgnoreCase("html")) {
-            final String lang = tagopts.getProperty("lang", EMPTY_STRING);
+        } else if (tag.name.equalsIgnoreCase("html")) {
+            final String lang = tag.opts.getProperty("lang", EMPTY_STRING);
             if (!lang.isEmpty()) // fake a language meta to preserv detection from <html lang="xx" />
                 this.metas.put("dc.language",lang.substring(0,2)); // fix found entries like "hu-hu"
         }
 
         // fire event
-        fireScrapeTag0(tagname, tagopts);
+        fireScrapeTag0(tag.name, tag.opts);
     }
 
     @Override
-    public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
-        // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
-        if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
-            String href = tagopts.getProperty("href", EMPTY_STRING);
+    public void scrapeTag1(Tag tag) {
+        // System.out.println("ScrapeTag1: tag.name=" + tag.name + ", opts=" + tag.opts.toString() + ", text=" + new String(tag.content.getChars()));
+        if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
+            String href = tag.opts.getProperty("href", EMPTY_STRING);
             href = CharacterCoding.html2unicode(href);
             AnchorURL url;
             if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
                 final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
-                if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) {
+                if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) {
                     // special handling of such urls: put them to the image urls
-                    final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
+                    final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1);
                     this.images.add(ie);
                 } else {
                     if (followDenied()) {
-                        String rel = tagopts.getProperty("rel", EMPTY_STRING);
+                        String rel = tag.opts.getProperty("rel", EMPTY_STRING);
                         if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; 
-                        tagopts.put("rel", rel);
+                        tag.opts.put("rel", rel);
                     }
-                    tagopts.put("text", new String(text));
-                    tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
-                    url.setAll(tagopts);
-                    recursiveParse(url, text);
+                    tag.opts.put("text", new String(tag.content.getChars()));
+                    tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
+                    url.setAll(tag.opts);
+                    recursiveParse(url, tag.content.getChars());
                     this.anchors.add(url);
                 }
             }
             this.evaluationScores.match(Element.apath, href);
         }
         final String h;
-        if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.headlines[0].add(h);
-        } else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.headlines[1].add(h);
-        } else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("h3")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.headlines[2].add(h);
-        } else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("h4")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.headlines[3].add(h);
-        } else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("h5")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.headlines[4].add(h);
-        } else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("h6")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.headlines[5].add(h);
-        } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
-            String t = recursiveParse(null, text);
-            this.titles.add(t);
-            this.evaluationScores.match(Element.title, t);
-        } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("title")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
+            this.titles.add(h);
+            this.evaluationScores.match(Element.title, h);
+        } else if ((tag.name.equalsIgnoreCase("b")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.bold.inc(h);
-        } else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("strong")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.bold.inc(h);
-        } else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("i")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.italic.inc(h);
-        } else if ((tagname.equalsIgnoreCase("u")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("u")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.underline.inc(h);
-        } else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
-            h = recursiveParse(null, text);
+        } else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.li.add(h);
-        } else if (tagname.equalsIgnoreCase("script")) {
-            final String src = tagopts.getProperty("src", EMPTY_STRING);
+        } else if (tag.name.equalsIgnoreCase("script")) {
+            final String src = tag.opts.getProperty("src", EMPTY_STRING);
             if (src.length() > 0) {
                 this.script.add(absolutePath(src));
                 this.evaluationScores.match(Element.scriptpath, src);
             } else {
-                this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(text)).replaceAll(" "));
+                this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" "));
             }
         }
 
         // fire event
-        fireScrapeTag1(tagname, tagopts, text);
+        fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
     }
 
 
@@ -570,15 +606,20 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         for (final AnchorURL entry: scraper.getAnchors()) {
             this.anchors.add(entry);
         }
+        String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
         for (ImageEntry ie: scraper.images) {
             if (linkurl != null) {
                 ie.setLinkurl(linkurl);
-                ie.setAnchortext(new String(inlineHtml));
+                ie.setAnchortext(line);
+            }
+            // this image may have been added recently from the same location (as this is a recursive parse)
+            // we want to keep only one of them, check if they are equal
+            if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) {
+                this.images.remove(this.images.size() - 1);
             }
             this.images.add(ie);
         }
 
-        String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
         scraper.close();
         return line;
     }
@@ -681,6 +722,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
     
     public String getText() {
+        this.content.trim();
         try {
             return this.content.toString();
         } catch (final OutOfMemoryError e) {
diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java
index e4dbf6238..5b6fd8252 100644
--- a/source/net/yacy/document/parser/html/ContentTransformer.java
+++ b/source/net/yacy/document/parser/html/ContentTransformer.java
@@ -29,7 +29,6 @@ import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Properties;
 import java.util.TreeSet;
 
 import net.yacy.cora.document.encoding.ASCII;
@@ -115,27 +114,27 @@ public class ContentTransformer extends AbstractTransformer implements Transform
     }
 
     @Override
-    public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
-        if (tagname.equals("img")) {
+    public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
+        if (tag.name.equals("img")) {
             // check bluelist
-            if (bluelistHit(tagopts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
-            if (bluelistHit(tagopts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
+            if (bluelistHit(tag.opts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
+            if (bluelistHit(tag.opts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
 
             // replace image alternative name
-            tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt", "").toCharArray())));
+            tag.opts.setProperty("alt", new String(transformText(tag.opts.getProperty("alt", "").toCharArray())));
         }
-        if (tagname.equals("input") && (tagopts.getProperty("type") != null && tagopts.getProperty("type").equals("submit"))) {
+        if (tag.name.equals("input") && (tag.opts.getProperty("type") != null && tag.opts.getProperty("type").equals("submit"))) {
             // rewrite button name
-            tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value", "").toCharArray())));
+            tag.opts.setProperty("value", new String(transformText(tag.opts.getProperty("value", "").toCharArray())));
         }
-        return TransformerWriter.genTag0(tagname, tagopts, quotechar);
+        return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
     }
 
     @Override
-    public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
-        if (bluelistHit(tagopts.getProperty("href","").toCharArray())) return genBlueLetters(text.length);
-        if (bluelistHit(text)) return genBlueLetters(text.length);
-        return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
+    public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
+        if (bluelistHit(tag.opts.getProperty("href","").toCharArray())) return genBlueLetters(tag.content.length());
+        if (bluelistHit(tag.content.getChars())) return genBlueLetters(tag.content.length());
+        return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
     }
 
     @Override
diff --git a/source/net/yacy/document/parser/html/Scraper.java b/source/net/yacy/document/parser/html/Scraper.java
index e1dfe73e1..dc8e3e964 100644
--- a/source/net/yacy/document/parser/html/Scraper.java
+++ b/source/net/yacy/document/parser/html/Scraper.java
@@ -24,8 +24,6 @@
 
 package net.yacy.document.parser.html;
 
-import java.util.Properties;
-
 public interface Scraper {
 
     public boolean isTag0(String tag);
@@ -34,9 +32,9 @@ public interface Scraper {
 
     public void scrapeText(char[] text, String insideTag);
 
-    public void scrapeTag0(String tagname, Properties tagopts);
+    public void scrapeTag0(ContentScraper.Tag tag);
 
-    public void scrapeTag1(String tagname, Properties tagopts, char[] text);
+    public void scrapeTag1(ContentScraper.Tag tag);
 
     public void scrapeComment(final char[] comment);
 
diff --git a/source/net/yacy/document/parser/html/Transformer.java b/source/net/yacy/document/parser/html/Transformer.java
index 2aedfa120..9b605340e 100644
--- a/source/net/yacy/document/parser/html/Transformer.java
+++ b/source/net/yacy/document/parser/html/Transformer.java
@@ -24,8 +24,6 @@
 
 package net.yacy.document.parser.html;
 
-import java.util.Properties;
-
 public interface Transformer {
 
     // the init method is used to initialize the transformer with some values
@@ -52,10 +50,10 @@ public interface Transformer {
     public char[] transformText(char[] text);
 
     // method that is called when a body-less tag occurs
-    public char[] transformTag0(String tagname, Properties tagopts, char quotechar);
+    public char[] transformTag0(ContentScraper.Tag tag, char quotechar);
 
     // method that is called when a body-containing text occurs
-    public char[] transformTag1(String tagname, Properties tagopts, char[] text, char quotechar);
+    public char[] transformTag1(ContentScraper.Tag tag, char quotechar);
 
     public void close();
 }
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index 53d8e91de..408322dcb 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -43,6 +43,7 @@ import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.util.Enumeration;
 import java.util.Properties;
+import java.util.Stack;
 
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
@@ -62,9 +63,7 @@ public final class TransformerWriter extends Writer {
     private final OutputStream outStream;
     private OutputStreamWriter out;
     private CharBuffer buffer;
-    private String       filterTag;
-    private Properties   filterOpts;
-    private CharBuffer filterCont;
+    private Stack<ContentScraper.Tag> tagStack;
     private final Scraper scraper;
     private final Transformer transformer;
     private boolean inSingleQuote;
@@ -72,7 +71,7 @@ public final class TransformerWriter extends Writer {
     private boolean inComment;
     private boolean binaryUnsuspect;
     private final boolean passbyIfBinarySuspect;
-
+    
     public TransformerWriter(
             final OutputStream outStream,
             final Charset charSet,
@@ -95,9 +94,7 @@ public final class TransformerWriter extends Writer {
         this.scraper       = scraper;
         this.transformer   = transformer;
         this.buffer        = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize);
-        this.filterTag     = null;
-        this.filterOpts    = null;
-        this.filterCont    = null;
+        this.tagStack      = new Stack<ContentScraper.Tag>();
         this.inSingleQuote = false;
         this.inDoubleQuote = false;
         this.inComment     = false;
@@ -186,63 +183,105 @@ public final class TransformerWriter extends Writer {
             return result;
     }
 
-    private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
-        //System.out.println("filterTag: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
-        // distinguish the following cases:
-        // - (1) not collecting data for a tag and getting no tag (not opener and not close)
-        // - (2) not collecting data for a tag and getting a tag opener
-        // - (3) not collecting data for a tag and getting a tag close
-        // - (4) collecting data for a tag and getting no tag (not opener and not close)
-        // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
-        // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
-        // - (7) collecting data for a tag and getting the correct close tag for that collecting tag
-
-        if (this.filterTag == null) {
+    /**
+     * the token processor distinguishes three different types of input: opening tag, closing tag, text content.
+     * Text is passed to filterTag(char[]); a tag is split into its lower-cased name and the remaining
+     * content and passed to filterTag(char[], char, String, boolean).
+     * The name is lower-cased with Locale.ROOT to be independent from the default locale of the JVM.
+     * @param in - the token to be processed
+     * @param quotechar - the quote character which is handed over to the tag filter
+     * @return a processed version of the token
+     */
+    private char[] tokenProcessor(final char[] in, final char quotechar) {
+        if (in.length == 0) return in;
+
+        // a token that is too short for a tag or does not start with lb is a text
+        if (in.length <= 2 || in[0] != lb) return filterTag(in);
+
+        // this is a tag; a slash directly after lb marks a closing tag
+        final boolean opening = in[1] != '/';
+
+        // the tag name starts behind lb (and the slash) and ends at tagEnd
+        final int namestart = opening ? 1 : 2;
+        final int tagend = tagEnd(in, namestart);
+        final String tag = new String(in, namestart, tagend - namestart).toLowerCase(java.util.Locale.ROOT);
+
+        // the rest of the token, from tagend on and without the last character,
+        // is handed over as content; filterTagOpening parses the tag properties from it
+        final char[] text = new char[in.length - tagend - 1];
+        System.arraycopy(in, tagend, text, 0, text.length);
+
+        // opening and closing tags are both handled by the tag filter
+        return filterTag(text, quotechar, tag, opening);
+    }
+    
+    // distinguish the following cases:
+    // - (1) not collecting data for a tag and getting no tag (not opener and not close)
+    // - (2) not collecting data for a tag and getting a tag opener
+    // - (3) not collecting data for a tag and getting a tag close
+    // - (4) collecting data for a tag and getting no tag (not opener and not close)
+    // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
+    // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
+    // - (7) collecting data for a tag and getting the correct close tag for that collecting tag
+    
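+    /**
+     * process a text token: without an open tag it is scraped and transformed directly, otherwise it is collected into the tag on top of the stack
+     * @param content the text token
+     * @return the (transformed) text if no tag is open, an empty array if the text was collected
+     */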
+    private char[] filterTag(final char[] content) {
+        if (this.tagStack.size() == 0) {
             // we are not collection tag text -> case (1) - (3)
+            // case (1): this is not a tag opener/closer
+            if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
+            if (this.transformer != null) return this.transformer.transformText(content);
+            return content;
+        }
 
-            if (tag == null) {
-                // case (1): this is not a tag opener/closer
-                if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
-                if (this.transformer != null) return this.transformer.transformText(content);
-                return content;
-            }
+        // we are collecting tag text for the tag on top of the tag stack -> case (4) - (7)
+        // case (4): getting no tag, go on collecting content
+        if (this.scraper != null) {
+            this.scraper.scrapeText(content, this.tagStack.lastElement().name);
+        }
+        if (this.transformer != null) {
+            this.tagStack.lastElement().content.append(this.transformer.transformText(content));
+        } else {
+            this.tagStack.lastElement().content.append(content);
+        }
+        return new char[0];
+    }
+            
+    private char[] filterTag(final char[] content, final char quotechar, final String tagname, final boolean opening) {
+        assert tagname != null;
+        
+        if (this.tagStack.size() == 0) {
+            // we are not collecting tag text -> case (1) - (3)
 
             // we have a new tag
             if (opening) {
                 // case (2):
-                return filterTagOpening(tag, content, quotechar);
+                return filterTagOpening(tagname, content, quotechar);
             }
 
-            // its a close tag
+            // it's a close tag where none should be
             // case (3): we ignore that thing and return it again
-            return genTag0raw(tag, false, content);
+            return genTag0raw(tagname, false, content);
 
         }
 
         // we are collection tag text for the tag 'filterTag' -> case (4) - (7)
-        if (tag == null || tag.equals("!")) {
-            // case (4): getting no tag, go on collecting content
-            if (this.scraper != null) {
-                this.scraper.scrapeText(content, this.filterTag);
-            }
-            if (this.transformer != null) {
-                this.filterCont.append(this.transformer.transformText(content));
-            } else {
-                this.filterCont.append(content);
-            }
-            return new char[0];
-        }
+        if (tagname.equals("!")) return filterTag(content); // case (4): a '!' token is collected like text, nothing else to do
 
         // it's a tag! which one?
         if (opening) {
             // case (5): the opening should not be here. But we keep the order anyway
-            this.filterCont.append(filterTagOpening(tag, content, quotechar));
-            return filterTagCloseing(quotechar);
+            this.tagStack.lastElement().content.append(filterTagOpening(tagname, content, quotechar));
+            return new char[0];
         }
 
-        if (!tag.equalsIgnoreCase(this.filterTag)) {
+        if (!tagname.equalsIgnoreCase(this.tagStack.lastElement().name)) {
             // case (6): its a closing tag, but the wrong one. just add it.
-            this.filterCont.append(genTag0raw(tag, opening, content));
+            this.tagStack.lastElement().content.append(genTag0raw(tagname, opening, content));
             return new char[0];
         }
 
@@ -250,101 +289,66 @@ public final class TransformerWriter extends Writer {
         return filterTagCloseing(quotechar);
     }
 
-    private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) {
-        if (this.scraper != null && this.scraper.isTag0(tag)) {
+    private char[] filterTagOpening(final String tagname, final char[] content, final char quotechar) {
+        final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
+        ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
+        charBuffer.close();
+        if (this.scraper != null && this.scraper.isTag0(tagname)) {
             // this single tag is collected at once here
-            final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
-            this.scraper.scrapeTag0(tag, charBuffer.propParser());
-            charBuffer.close();
+            this.scraper.scrapeTag0(tag);
         }
-        if (this.transformer != null && this.transformer.isTag0(tag)) {
+        if (this.transformer != null && this.transformer.isTag0(tagname)) {
             // this single tag is collected at once here
-            final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
             char[] b = new char[0];
-            try {
-                b = this.transformer.transformTag0(tag, scb.propParser(), quotechar);
-            } finally {
-                scb.close();
-            }
+            b = this.transformer.transformTag0(tag, quotechar);
             return b;
-        } else if ((this.scraper != null && this.scraper.isTag1(tag)) ||
-                   (this.transformer != null && this.transformer.isTag1(tag))) {
-            // ok, start collecting
-            this.filterTag = tag;
-            final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
-            this.filterOpts = scb.propParser();
-            scb.close();
-            if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
+        } else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
+                   (this.transformer != null && this.transformer.isTag1(tagname))) {
+            // ok, start collecting; we don't push this here to the scraper or transformer; we do that when the tag is closed.
+            this.tagStack.push(tag);
             return new char[0];
         } else {
              // we ignore that thing and return it again
-             return genTag0raw(tag, true, content);
+             return genTag0raw(tagname, true, content);
         }
     }
 
     private char[] filterTagCloseing(final char quotechar) {
         char[] ret;
-        if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
+        ContentScraper.Tag tag = this.tagStack.lastElement();
+        if (this.scraper != null) this.scraper.scrapeTag1(tag);
         if (this.transformer != null) {
-            ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+            ret = this.transformer.transformTag1(tag, quotechar);
         } else {
-            ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+            ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
+        }
+        if ((this.scraper != null && this.scraper.isTag1(tag.name)) ||
+            (this.transformer != null && this.transformer.isTag1(tag.name))) {
+            // remove the tag from the stack as soon as the tag is processed
+            this.tagStack.pop();
+            // at this point the characters from the recently processed tag must be attached to the previous tag
+            if (this.tagStack.size() > 0) this.tagStack.lastElement().content.append(ret);
         }
-        this.filterTag = null;
-        this.filterOpts = null;
-        this.filterCont = null;
         return ret;
     }
 
     private char[] filterFinalize(final char quotechar) {
-        if (this.filterTag == null) {
+        if (this.tagStack.size() == 0) {
             return new char[0];
         }
 
         // it's our closing tag! return complete result.
         char[] ret;
-        if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
+        if (this.scraper != null) this.scraper.scrapeTag1(this.tagStack.lastElement());
         if (this.transformer != null) {
-            ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+            ret = this.transformer.transformTag1(this.tagStack.lastElement(), quotechar);
         } else {
-            ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+            ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar);
         }
-        this.filterTag = null;
-        this.filterOpts = null;
-        this.filterCont = null;
+        this.tagStack.pop();
         return ret;
     }
 
-    private char[] filterSentence(final char[] in, final char quotechar) {
-        if (in.length == 0) return in;
-        //System.out.println("filterSentence, quotechar = \"" + quotechar + "\": " + new String(in)); // debug
-        // scan the string and parse structure
-        if (in.length > 2 && in[0] == lb) {
-
-            // a tag
-            String tag;
-            int tagend;
-            if (in[1] == '/') {
-                // a closing tag
-                tagend = tagEnd(in, 2);
-                tag = new String(in, 2, tagend - 2).toLowerCase();
-                final char[] text = new char[in.length - tagend - 1];
-                System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
-                return filterTag(tag, false, text, quotechar);
-            }
-
-            // an opening tag
-            tagend = tagEnd(in, 1);
-            tag = new String(in, 1, tagend - 1).toLowerCase();
-            final char[] text = new char[in.length - tagend - 1];
-            System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
-            return filterTag(tag, true, text, quotechar);
-        }
-
-        // a text
-        return filterTag(null, true, in, quotechar);
-    }
-
     private static int tagEnd(final char[] tag, final int start) {
         char c;
         for (int i = start; i < tag.length; i++) {
@@ -358,6 +362,14 @@ public final class TransformerWriter extends Writer {
         return tag.length - 1;
     }
 
+    /**
+     * this is the tokenizer of the parser: it splits the input into pieces which are
+     * - quoted text parts
+     * - commented text parts
+     * - tags (opening and closing)
+     * - text content between all these parts
+     * The tokens are then parsed with the tokenProcessor method
+     */
     @Override
     public void write(final int c) throws IOException {
         //System.out.println((char) c);
@@ -375,7 +387,7 @@ public final class TransformerWriter extends Writer {
                 if ((c == rb) && (this.buffer.length() > 0 && this.buffer.charAt(0) == lb)) {
                     this.inSingleQuote = false;
                     // the tag ends here. after filtering: pass on
-                    filtered = filterSentence(this.buffer.getChars(), singlequote);
+                    filtered = tokenProcessor(this.buffer.getChars(), singlequote);
                     if (this.out != null) { this.out.write(filtered); }
                     // this.buffer = new serverByteBuffer();
                     this.buffer.reset();
@@ -387,7 +399,7 @@ public final class TransformerWriter extends Writer {
                 if (c == rb && this.buffer.length() > 0 && this.buffer.charAt(0) == lb) {
                     this.inDoubleQuote = false;
                     // the tag ends here. after filtering: pass on
-                    filtered = filterSentence(this.buffer.getChars(), doublequote);
+                    filtered = tokenProcessor(this.buffer.getChars(), doublequote);
                     if (this.out != null) this.out.write(filtered);
                     // this.buffer = new serverByteBuffer();
                     this.buffer.reset();
@@ -425,7 +437,7 @@ public final class TransformerWriter extends Writer {
                     } else if (c == rb) {
                         this.buffer.append(c);
                         // the tag ends here. after filtering: pass on
-                        filtered = filterSentence(this.buffer.getChars(), doublequote);
+                        filtered = tokenProcessor(this.buffer.getChars(), doublequote);
                         if (this.out != null) this.out.write(filtered);
                         // this.buffer = new serverByteBuffer();
                         this.buffer.reset();
@@ -433,7 +445,7 @@ public final class TransformerWriter extends Writer {
                         // this is an error case
                         // we consider that there is one rb missing
                         if (this.buffer.length() > 0) {
-                            filtered = filterSentence(this.buffer.getChars(), doublequote);
+                            filtered = tokenProcessor(this.buffer.getChars(), doublequote);
                             if (this.out != null) this.out.write(filtered);
                         }
                         // this.buffer = new serverByteBuffer();
@@ -447,7 +459,7 @@ public final class TransformerWriter extends Writer {
                     if (c == lb) {
                         // the text ends here
                         if (this.buffer.length() > 0) {
-                            filtered = filterSentence(this.buffer.getChars(), doublequote);
+                            filtered = tokenProcessor(this.buffer.getChars(), doublequote);
                             if (this.out != null) this.out.write(filtered);
                         }
                         // this.buffer = new serverByteBuffer();
@@ -492,7 +504,7 @@ public final class TransformerWriter extends Writer {
         final char quotechar = (this.inSingleQuote) ? singlequote : doublequote;
         if (this.buffer != null) {
             if (this.buffer.length() > 0) {
-                final char[] filtered = filterSentence(this.buffer.getChars(), quotechar);
+                final char[] filtered = tokenProcessor(this.buffer.getChars(), quotechar);
                 if (this.out != null) this.out.write(filtered);
             }
             this.buffer.close();
@@ -504,10 +516,8 @@ public final class TransformerWriter extends Writer {
             this.out.flush();
             this.out.close();
         }
-        this.filterTag = null;
-        this.filterOpts = null;
-        if (this.filterCont != null) this.filterCont.close();
-        this.filterCont = null;
+        this.tagStack.clear();
+        this.tagStack = null;
         if (this.scraper != null) this.scraper.finish();
     }
 
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 3ed19ebf1..b75d06dd5 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -28,16 +28,18 @@ import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.LinkedHashMap;
-import java.util.regex.Pattern;
 
+import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.util.CommonPattern;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -53,9 +55,7 @@ import com.ibm.icu.text.CharsetDetector;
 
 public class htmlParser extends AbstractParser implements Parser {
 
-    private static final Pattern patternUnderline = Pattern.compile("_");
-    private final int maxLinks = 10000;
-    private Charset detectedcharset;
+    private static final int maxLinks = 10000;
 
     public htmlParser() {
         super("Streaming HTML Parser");
@@ -97,9 +97,10 @@ public class htmlParser extends AbstractParser implements Parser {
 
         try {
             // first get a document from the parsed html
-            final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
+            Charset[] detectedcharsetcontainer = new Charset[]{null};
+            final ContentScraper scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
             // parseToScraper also detects/corrects/sets charset from html content tag
-            final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper);
+            final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
 
             return new Document[]{document};
         } catch (final IOException e) {
@@ -155,9 +156,27 @@ public class htmlParser extends AbstractParser implements Parser {
         return ppd;
     }
 
-    public ContentScraper parseToScraper(
+    /**
+     * parse html given as string; a Parser.Failure of the stream parser is rethrown as IOException with the failure as cause
+     */
+    public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, String input, int maxLinks) throws IOException {
+        InputStream sourceStream;
+        try {
+            sourceStream = new ByteArrayInputStream(documentCharset == null ? UTF8.getBytes(input) : input.getBytes(documentCharset));
+        } catch (UnsupportedEncodingException e) {
+            sourceStream = new ByteArrayInputStream(UTF8.getBytes(input)); // charset name not supported: fall back to UTF8.getBytes
+        }
+        try {
+            return parseToScraper(location, documentCharset, new Charset[]{null}, sourceStream, maxLinks);
+        } catch (Failure e) {
+            throw new IOException(e.getMessage(), e);
+        }
+    }
+    
+    public static ContentScraper parseToScraper(
             final DigestURL location,
             final String documentCharset,
+            Charset[] detectedcharsetcontainer,
             InputStream sourceStream,
             final int maxLinks) throws Parser.Failure, IOException {
 
@@ -171,13 +190,15 @@ public class htmlParser extends AbstractParser implements Parser {
 
         // nothing found: try to find a meta-tag
         if (charset == null) {
+            ScraperInputStream htmlFilter = null;
             try {
-                final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
+                htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
                 sourceStream = htmlFilter;
                 charset = htmlFilter.detectCharset();
-                htmlFilter.close();
             } catch (final IOException e1) {
                 throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
+            } finally {
+                if (htmlFilter != null) htmlFilter.close();
             }
         }
 
@@ -193,21 +214,22 @@ public class htmlParser extends AbstractParser implements Parser {
 
         // wtf? still nothing, just take system-standard
         if (charset == null) {
-            detectedcharset = Charset.defaultCharset();
+            detectedcharsetcontainer[0] = Charset.defaultCharset();
         } else {
             try {
-                detectedcharset = Charset.forName(charset);
+                detectedcharsetcontainer[0] = Charset.forName(charset);
             } catch (final IllegalCharsetNameException e) {
-                detectedcharset = Charset.defaultCharset();
+                detectedcharsetcontainer[0] = Charset.defaultCharset();
             } catch (final UnsupportedCharsetException e) {
-                detectedcharset = Charset.defaultCharset();
+                detectedcharsetcontainer[0] = Charset.defaultCharset();
             }
         }
+        
         // parsing the content
         final ContentScraper scraper = new ContentScraper(location, maxLinks);
         final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
         try {
-            FileUtils.copy(sourceStream, writer, detectedcharset);
+            FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
         } catch (final IOException e) {
             throw new Parser.Failure("IO error:" + e.getMessage(), location);
         } finally {
@@ -250,7 +272,7 @@ public class htmlParser extends AbstractParser implements Parser {
         if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
 
         // fix wrong fill characters
-        encoding = patternUnderline.matcher(encoding).replaceAll("-");
+        encoding = CommonPattern.UNDERSCORE.matcher(encoding).replaceAll("-");
 
         if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312";
         if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
@@ -306,10 +328,9 @@ public class htmlParser extends AbstractParser implements Parser {
         try {
             url = new AnchorURL(args[0]);
             final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
-            final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
+            final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new ByteArrayInputStream(content));
             final String title = document[0].dc_title();
             System.out.println(title);
-            System.out.println(CharacterCoding.unicode2html(title, false));
         } catch (final MalformedURLException e) {
             e.printStackTrace();
         } catch (final IOException e) {
@@ -319,6 +340,7 @@ public class htmlParser extends AbstractParser implements Parser {
         } catch (final InterruptedException e) {
             e.printStackTrace();
         }
+        System.exit(0);
     }
 
 }
diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java
index 420c64417..a6394f66f 100644
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@@ -81,6 +81,8 @@ public class genericImageParser extends AbstractParser implements Parser {
         SUPPORTED_EXTENSIONS.add("jpeg");
         SUPPORTED_EXTENSIONS.add("jpe");
         SUPPORTED_EXTENSIONS.add("bmp");
+        SUPPORTED_EXTENSIONS.add("tif");
+        SUPPORTED_EXTENSIONS.add("tiff");
         SUPPORTED_MIME_TYPES.add("image/png");
         SUPPORTED_MIME_TYPES.add("image/gif");
         SUPPORTED_MIME_TYPES.add("image/jpeg");
diff --git a/source/net/yacy/search/schema/HyperlinkGraph.java b/source/net/yacy/search/schema/HyperlinkGraph.java
index 312f70674..a8ce35fe0 100644
--- a/source/net/yacy/search/schema/HyperlinkGraph.java
+++ b/source/net/yacy/search/schema/HyperlinkGraph.java
@@ -161,7 +161,7 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
                 remaining--;
             }
         }
-        if (nodes.size() == 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
+        if (nodes.size() == 0 && this.edges.size() > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
 
         // recusively step into depth and find next level
         int depth = 1;
diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java
index 364bf483b..22ae657d1 100644
--- a/source/net/yacy/search/schema/WebgraphConfiguration.java
+++ b/source/net/yacy/search/schema/WebgraphConfiguration.java
@@ -51,6 +51,8 @@ import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.document.parser.htmlParser;
+import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
 
 public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@@ -219,26 +221,31 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
             add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
             processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
         }
+
+        // parse the anchor html to find images and the clear text; textContent stays null when parsing fails
+        ContentScraper textContent = null;
+        try {textContent = htmlParser.parseToScraper(source_url, null, text, 10);} catch (IOException e) {ConcurrentLog.warn("WebgraphConfiguration", "cannot parse anchor text of link in " + source_url + ": " + e.getMessage());}
+        String extractedText = textContent == null ? text : textContent.getText(); // fall back to the raw anchor text (as indexed before this change) instead of dereferencing null
         
         // add the source attributes about the target
         if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
         if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
         if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
         if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
-        if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
-        if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
-        if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
+        if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : "");
+        if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length());
+        if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0);
         
-        ImageEntry ientry = null;
-        for (ImageEntry ie: images) {
-            if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
+        StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30);
+        if (textContent != null) for (ImageEntry ie: textContent.getImages()) {
+            if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' ');
         }
-        String alttext = ientry == null ? "" : ientry.alt();
-        if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
+        while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1);
+        if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString());
         if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
         if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
         
-        // add the target attributes            
+        // add the target attributes
         add(edge, WebgraphSchema.target_id_s, target_id);
         final String target_url_string = target_url.toNormalform(false);
         int pr_target = target_url_string.indexOf("://",0);