- bugfixes in html parser

- new fields in solr
- extended file viewer to debug parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7897 6c8d7289-2bf4-0310-a012-ef5d649a1542
14 years ago · 5dd2efc9a2
parent 2c595a6a47
commit 5dd2efc9a2
7 changed files with 200 additions and 54 deletions
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@ -73,7 +73,12 @@ text_t
 wordcount_i

 ## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
-attr_inboundlinks
+attr_inboundlinks_tag
+attr_inboundlinks_protocol
+attr_inboundlinks_urlstub
+attr_inboundlinks_name
+attr_inboundlinks_rel
+attr_inboundlinks_text

 ## total number of inbound links, int
 inboundlinkscount_i
@ -82,7 +87,12 @@ inboundlinkscount_i
 inboundlinksnoindexcount_i

 ## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
-attr_outboundlinks
+attr_outboundlinks_tag
+attr_outboundlinks_protocol
+attr_outboundlinks_urlstub
+attr_outboundlinks_name
+attr_outboundlinks_rel
+attr_outboundlinks_text

 ## total number of external links, int
 outboundlinkscount_i
--- a/htroot/ViewFile.html
+++ b/htroot/ViewFile.html
@ -115,13 +115,22 @@
 :: <!-- 6 -->
 	<fieldset><legend>Link List</legend>
 	<table border="0" cellpadding="2" cellspacing="1">
+		<tr class="head">
+			<td>nr</td>
+			<td class="tt">type</td>
+			<td class="tt">name</td>
+			<td class="tt">link</td>
+			<td class="tt">text</td>
+			<td class="tt">rel</td>
+		</tr>
 		#{links}#
 		<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
 			<td>#[nr]#</td>
 			<td class="tt"><tt>#[type]#</tt></td>
-			<td class="tt"><tt>#[text]#</tt></td>
+			<td class="tt"><tt>#[name]#</tt></td>
 			<td class="tt"><tt><a href="#[url]#">#[link]#</a></tt></td>
-			<td class="tt"><tt>#[attr]#</tt></td>
+			<td class="tt"><tt>#[text]#</tt></td>
+			<td class="tt"><tt>#[rel]#</tt></td>
 		</tr>#{/links}#
 	</table>
        </fieldset>
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@ -36,6 +36,7 @@ import java.util.Collection;
 import java.util.Enumeration;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Properties;

 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
@ -309,8 +310,8 @@ public class ViewFile {
                prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
                boolean dark = true;
                int i = 0;
-                i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0));
-                i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
+                i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0), document.getAnchors());
+                i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors());
                dark = (i % 2 == 0);

                final Map<MultiProtocolURI, ImageEntry> ts = document.getImages();
@ -324,15 +325,17 @@ public class ViewFile {
                    prop.put("viewMode_links_" + i + "_text", (entry.alt().isEmpty()) ? "&nbsp;" : markup(wordArray, entry.alt()));
                    prop.put("viewMode_links_" + i + "_url", entry.url().toNormalform(false, true));
                    prop.put("viewMode_links_" + i + "_link", markup(wordArray, entry.url().toNormalform(false, true)));
-                    if (entry.width() > 0 && entry.height() > 0)
-                        prop.put("viewMode_links_" + i + "_attr", entry.width() + "x" + entry.height() + " Pixel");
-                    else
-                        prop.put("viewMode_links_" + i + "_attr", "unknown");
+                    if (entry.width() > 0 && entry.height() > 0) {
+                        prop.put("viewMode_links_" + i + "_rel", entry.width() + "x" + entry.height() + " Pixel");
+                    } else {
+                        prop.put("viewMode_links_" + i + "_rel", "");
+                    }
+                    prop.put("viewMode_links_" + i + "_name", "");
                    dark = !dark;
                    i++;
                }
-                i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0));
-                i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0));
+                i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0), document.getAnchors());
+                i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0), document.getAnchors());
                prop.put("viewMode_links", i);

            }
@ -382,16 +385,29 @@ public class ViewFile {
        return message;
    }

-    private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<MultiProtocolURI, String> media, final String name, boolean dark) {
+    private static int putMediaInfo(
+                    final serverObjects prop,
+                    final String[] wordArray,
+                    int c,
+                    final Map<MultiProtocolURI, String> media,
+                    final String type,
+                    boolean dark,
+                    final Map<MultiProtocolURI, Properties> alllinks) {
        int i = 0;
        for (final Map.Entry<MultiProtocolURI, String> entry : media.entrySet()) {
+            final Properties p = alllinks.get(entry.getKey()); // null if alllinks has no entry for this link
+            final String name = (p == null) ? "" : p.getProperty("name", ""); // the name attribute
+            final String rel = (p == null) ? "" : p.getProperty("rel", "");   // the rel-attribute
+            final String text = (p == null) ? "" : p.getProperty("text", ""); // the text between the <a></a> tag
+
            prop.put("viewMode_links_" + c + "_nr", c);
            prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0));
-            prop.putHTML("viewMode_links_" + c + "_type", name);
-            prop.put("viewMode_links_" + c + "_text", ((entry.getValue().isEmpty()) ? "&nbsp;" : markup(wordArray, entry.getValue()) ));
+            prop.putHTML("viewMode_links_" + c + "_type", type);
+            prop.put("viewMode_links_" + c + "_text", text + "/" + ((entry.getValue().isEmpty()) ? "&nbsp;" : markup(wordArray, entry.getValue()) )); // NOTE(review): text is inserted unescaped here - confirm whether it must be escaped as well
             prop.put("viewMode_links_" + c + "_link", markup(wordArray, entry.getKey().toNormalform(true, false)));
             prop.put("viewMode_links_" + c + "_url", entry.getKey().toNormalform(true, false));
-            prop.put("viewMode_links_" + c + "_attr", "&nbsp;");
+            prop.putHTML("viewMode_links_" + c + "_rel", rel);   // rel is read from the link properties of the parsed document: escape it for the HTML template
+            prop.putHTML("viewMode_links_" + c + "_name", name); // name is read from the link properties of the parsed document: escape it for the HTML template
            dark = !dark;
            c++;
            i++;
--- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
@ -129,36 +129,76 @@ public class SolrScheme extends ConfigurationSet {
        if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
        if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
        if (isEmpty() || contains("attr_inboundlinks")) {
-            final String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
+            final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
+            final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
+            final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
+            final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
+            final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
+            final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
            for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
                final Properties p = alllinks.get(url);
-                final String name = p.getProperty("name", "");
-                final String rel = p.getProperty("rel", "");
-                inboundlinks[c++] =
+                final String name = p.getProperty("name", ""); // the name attribute
+                final String rel = p.getProperty("rel", "");   // the rel-attribute
+                final String text = p.getProperty("text", ""); // the text between the <a></a> tag
+                final String urls = url.toNormalform(false, false);
+                final int pr = urls.indexOf("://"); // -1 if the normal form contains no "://"
+                inboundlinksURLProtocol[c] = pr < 0 ? "" : urls.substring(0, pr); // without "://" no protocol can be split off
+                inboundlinksURLStub[c] = pr < 0 ? urls : urls.substring(pr + 3);  // ... and the whole URL is kept as stub
+                inboundlinksName[c] = name.length() > 0 ? name : "";
+                inboundlinksRel[c] = rel.length() > 0 ? rel : "";
+                inboundlinksText[c] = text.length() > 0 ? text : ""; // the text field gets the anchor text, not the rel value
+                inboundlinksTag[c] =
                    "<a href=\"" + url.toNormalform(false, false) + "\"" +
                    (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
                    ">" +
                    ((name.length() > 0) ? name : "") + "</a>";
+                c++;
            }
-            addSolr(solrdoc, "attr_inboundlinks", inboundlinks);
+            addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
+            addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
+            addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
+            addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
+            addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
+            addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
        }
+
        c = 0;
        if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
        if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
        if (isEmpty() || contains("attr_outboundlinks")) {
-            final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
+            final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
+            final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
+            final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
+            final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
+            final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
+            final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
            for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
                final Properties p = alllinks.get(url);
-                final String name = p.getProperty("name", "");
-                final String rel = p.getProperty("rel", "");
-                outboundlinks[c++] =
+                final String name = p.getProperty("name", ""); // the name attribute
+                final String rel = p.getProperty("rel", "");   // the rel-attribute
+                final String text = p.getProperty("text", ""); // the text between the <a></a> tag
+                final String urls = url.toNormalform(false, false);
+                final int pr = urls.indexOf("://"); // -1 if the normal form contains no "://"
+                outboundlinksURLProtocol[c] = pr < 0 ? "" : urls.substring(0, pr); // without "://" no protocol can be split off
+                outboundlinksURLStub[c] = pr < 0 ? urls : urls.substring(pr + 3);  // ... and the whole URL is kept as stub
+                outboundlinksName[c] = name.length() > 0 ? name : "";
+                outboundlinksRel[c] = rel.length() > 0 ? rel : "";
+                outboundlinksText[c] = text.length() > 0 ? text : ""; // the text field gets the anchor text, not the rel value
+                outboundlinksTag[c] =
                    "<a href=\"" + url.toNormalform(false, false) + "\"" +
                    (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
                    ">" +
                    ((name.length() > 0) ? name : "") + "</a>";
+                c++;
            }
-            addSolr(solrdoc, "attr_outboundlinks", outboundlinks);
+            addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
+            addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
+            addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
+            addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
+            addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
+            addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
        }
+
        // charset
        addSolr(solrdoc, "charset_s", yacydoc.getCharset());

--- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java
@ -242,8 +242,9 @@ public class SolrSingleConnector {
    }

    protected void addSolr(final Collection<SolrInputDocument> docs) throws IOException, SolrException {
+
        try {
-            this.server.add(docs);
+            if (docs.size() != 0) this.server.add(docs);
            this.server.commit();
            /* To immediately commit after adding documents, you could use:
                  UpdateRequest req = new UpdateRequest();
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -37,6 +37,7 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Properties;
 import java.util.Set;
 import java.util.regex.Matcher;
@ -172,6 +173,36 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.canonical = null;
    }

+    private void mergeAnchors(final MultiProtocolURI url, final Properties p) { // stores p for url, or merges p into the properties already stored for url
+        final Properties p0 = this.anchors.get(url); // the properties stored so far for this url, if any
+        if (p0 == null) {
+            this.anchors.put(url, p); // first occurrence of this url: p itself is stored, not a copy
+            return;
+        }
+        // merge properties: every non-empty value of p overwrites the value stored under the same key in p0; null and empty values are skipped
+        for (final Entry<Object, Object> entry: p.entrySet()) {
+            if (entry.getValue() != null && entry.getValue().toString().length() > 0) p0.put(entry.getKey(), entry.getValue());
+        }
+        this.anchors.put(url, p0); // p0 is the object that was read from the map above
+    }
+
+    /*
+    private void mergeAnchors(final MultiProtocolURI url, final String key, final String value) {
+        if (value == null) return;
+        if (value.length() == 0) return;
+        Properties p0 = this.anchors.get(url);
+        if (p0 == null) {
+            p0 = new Properties();
+            p0.put(key, value);
+            this.anchors.put(url, p0);
+            return;
+        }
+        // merge properties
+        p0.put(key, value);
+        this.anchors.put(url, p0);
+    }
+     */
+
    public void scrapeText(final char[] newtext, final String insideTag) {
        // System.out.println("SCRAPE: " + UTF8.String(newtext));
        int p, pl, q, s = 0;
@ -258,7 +289,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            s = p + 6;
            try {
                url = new MultiProtocolURI(u);
-                this.anchors.put(url, new Properties());
+                mergeAnchors(url, new Properties());
                continue;
            } catch (final MalformedURLException e) {}
        }
@ -306,7 +337,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            } catch (final MalformedURLException e) {}
        } else if (tagname.equalsIgnoreCase("frame")) {
            final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
-            this.anchors.put(src, tagopts /* with property "name" */);
+            mergeAnchors(src, tagopts /* with property "name" */);
            this.frames.add(src);
            this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
        } else if (tagname.equalsIgnoreCase("body")) {
@ -333,8 +364,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            final String areatitle = cleanLine(tagopts.getProperty("title",""));
            //String alt   = tagopts.getProperty("alt","");
            final String href  = tagopts.getProperty("href", "");
-            final Properties p = new Properties(); p.put("name", areatitle);
-            if (href.length() > 0) this.anchors.put(absolutePath(href), p);
+            tagopts.put("name", areatitle); // "name" is the key that the readers of the anchor properties look up
+            if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts);
        } else if (tagname.equalsIgnoreCase("link")) {
            final String href = tagopts.getProperty("href", "");
            final MultiProtocolURI newLink = absolutePath(href);
@ -349,8 +380,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                    this.images.put(ie.url(), ie);
                    this.favicon = newLink;
                } else if (rel.equalsIgnoreCase("canonical")) {
-                    final Properties p = new Properties(); p.put("name", this.title);
-                    this.anchors.put(newLink, p);
+                    tagopts.put("name", this.title);
+                    mergeAnchors(newLink, tagopts);
                    this.canonical = newLink;
                } else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
                    this.rss.put(newLink, linktitle);
@ -358,16 +389,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                    this.css.put(newLink, rel);
                    this.evaluationScores.match(Element.csspath, href);
                } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
-                    final Properties p = new Properties(); p.put("name", linktitle);
-                    this.anchors.put(newLink, p);
+                    tagopts.put("name", linktitle);
+                    mergeAnchors(newLink, tagopts);
                }
            }
        } else if(tagname.equalsIgnoreCase("embed")) {
-            this.anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
+            mergeAnchors(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
        } else if(tagname.equalsIgnoreCase("param")) {
            final String name = tagopts.getProperty("name", "");
            if (name.equalsIgnoreCase("movie")) {
-                this.anchors.put(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
+                mergeAnchors(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
            }
        }

@ -389,8 +420,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                    final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
                    addImage(this.images, ie);
                } else {
-                    tagopts.put("name", recursiveParse(text));
-                    this.anchors.put(url, tagopts);
+                    tagopts.put("text", recursiveParse(text));
+                    mergeAnchors(url, tagopts);
                }
            }
            this.evaluationScores.match(Element.apath, href);
@ -430,7 +461,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            if (h.length() > 0) this.li.add(h);
        } else if (tagname.equalsIgnoreCase("iframe")) {
            final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
-            this.anchors.put(src, tagopts /* with property "name" */);
+            mergeAnchors(src, tagopts /* with property "name" */);
            this.iframes.add(src);
            this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
        } else if (tagname.equalsIgnoreCase("script")) {
@ -466,7 +497,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            Log.logException(e);
            return cleanLine(super.stripAll(inlineHtml));
        }
-        this.anchors.putAll(scraper.getAnchors());
+        for (final Map.Entry<MultiProtocolURI, Properties> entry: scraper.getAnchors().entrySet()) {
+            mergeAnchors(entry.getKey(), entry.getValue());
+        }
        this.images.putAll(scraper.images);

        return cleanLine(super.stripAll(scraper.content.getChars()));
@ -559,7 +592,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    }

    public boolean containsFlash() {
-        this.anchors = new HashMap<MultiProtocolURI, Properties>();
        String ext;
        for (final MultiProtocolURI url: this.anchors.keySet()) {
            ext = url.getFileExtension();
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@ -199,15 +199,51 @@ public final class TransformerWriter extends Writer {
            return result;
    }

+    private static final char[] mergedScrape(final char[] a, final char[] b) { // joins a and b with a single space in between
+        if (a == null) return b; // no first part: the second part is returned as it is (which may be null as well)
+        if (b == null) return a; // no second part: the first part is returned as it is
+        final StringBuilder sb = new StringBuilder(a.length + b.length + 1); // +1 for the separating space
+        sb.append(a).append(' ').append(b);
+        return sb.toString().toCharArray();
+    }
+
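+    /**
+     * Every tag that appears is handed to the filterTag method. The method returns the text of the tag
+     * and also hands the tag content to the scraper. If a collected tag is still unclosed when another
+     * tag opens, the unclosed tag is closed first and its output is merged with the new output into one char[].
+     * @param tag the tag name, or null if the content is not a tag opener/closer
+     * @param opening true if the tag is an opening tag
+     * @param content the characters of the tag or of the text
+     * @param quotechar the quote character that is passed on when a tag is transformed or generated
+     * @return the filtered output; an empty array while the content of a tag is being collected
+     */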
    private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
-//      System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug
+        //System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
+        char[] unclosed = null;
+
+        if (this.filterTag != null && opening) {
+            // there is a missing close tag for the currently parsed tag filterTag
+            // close that tag here and go on with new tag
+            if (this.scraper != null) {
+                this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
+            }
+            if (this.transformer != null) {
+                unclosed = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+            } else {
+                unclosed = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+            }
+            this.filterTag = null;
+            this.filterOpts = null;
+            this.filterCont = null;
+        }
+
        if (this.filterTag == null) {
            // we are not collection tag text
            if (tag == null) {
                // and this is not a tag opener/closer
                if (this.scraper != null) this.scraper.scrapeText(content, null);
-                if (this.transformer != null) return this.transformer.transformText(content);
-                return content;
+                if (this.transformer != null) return mergedScrape(unclosed, this.transformer.transformText(content));
+                return mergedScrape(unclosed, content);
            }

            // we have a new tag
@ -227,7 +263,7 @@ public final class TransformerWriter extends Writer {
                    // this single tag is collected at once here
                	final CharBuffer scb = new CharBuffer(content);
                	try {
-                		return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
+                		return mergedScrape(unclosed, this.transformer.transformTag0(tag, scb.propParser(), quotechar));
                	} finally {
                		try {
 							scb.close();
@ -247,15 +283,15 @@ public final class TransformerWriter extends Writer {
 					    Log.logException(e);
 					}
                    if (this.filterCont == null) this.filterCont = new CharBuffer(Math.max(100, content.length)); else this.filterCont.reset();
-                    return new char[0];
+                    return mergedScrape(unclosed, new char[0]);
                } else {
                     // we ignore that thing and return it again
-                     return genTag0raw(tag, true, content);
+                     return mergedScrape(unclosed, genTag0raw(tag, true, content));
                }
            }

            // we ignore that thing and return it again
-            return genTag0raw(tag, false, content);
+            return mergedScrape(unclosed, genTag0raw(tag, false, content));

        }

@ -270,14 +306,14 @@ public final class TransformerWriter extends Writer {
                    this.filterCont.append(content);
                }
            } catch (final OutOfMemoryError e) {}
-            return new char[0];
+            return mergedScrape(unclosed, new char[0]);
        }

        // it's a tag! which one?
-        if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) {
+        if (opening || !(tag.equalsIgnoreCase(this.filterTag))) {
            // this tag is not our concern. just add it
            this.filterCont.append(genTag0raw(tag, opening, content));
-            return new char[0];
+            return mergedScrape(unclosed, new char[0]);
        }

        // it's our closing tag! return complete result.
@ -291,7 +327,7 @@ public final class TransformerWriter extends Writer {
        this.filterTag = null;
        this.filterOpts = null;
        this.filterCont = null;
-        return ret;
+        return mergedScrape(unclosed, ret);
    }

    private char[] filterFinalize(final char quotechar) {
@ -301,7 +337,9 @@ public final class TransformerWriter extends Writer {

        // it's our closing tag! return complete result.
        char[] ret;
-        if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
+        if (this.scraper != null) {
+            this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
+        }
        if (this.transformer != null) {
            ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
        } else {