bugfixes in html parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7912 6c8d7289-2bf4-0310-a012-ef5d649a1542
14 years ago · 1c007188ad
parent b00e69c5df
commit 1c007188ad
7 changed files with 230 additions and 170 deletions
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@ -45,8 +45,11 @@ keywords
 ## character encoding, string
 charset_s

+## tags of css entries, normalized with absolute URL, textgen
+attr_css_tag
+
 ## urls of css entries, normalized with absolute URL, textgen
-attr_css
+attr_css_url

 ## number of css entries, int
 csscount_i
@ -74,11 +77,24 @@ wordcount_i

 ## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
 attr_inboundlinks_tag
-attr_inboundlinks_protocol
-attr_inboundlinks_urlstub
-attr_inboundlinks_name
-attr_inboundlinks_rel
-attr_inboundlinks_text
+
+## internal links, only the protocol
+#attr_inboundlinks_protocol
+
+## internal links, the url only without the protocol
+#attr_inboundlinks_urlstub
+
+## internal links, the name property of the a-tag
+#attr_inboundlinks_name
+
+## internal links, the rel property of the a-tag
+#attr_inboundlinks_rel
+
+## internal links, the rel property of the a-tag, coded binary
+#attr_inboundlinks_relcode
+
+## internal links, the text content of the a-tag
+#attr_inboundlinks_text

 ## total number of inbound links, int
 inboundlinkscount_i
@ -88,18 +104,43 @@ inboundlinksnoindexcount_i

 ## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
 attr_outboundlinks_tag
-attr_outboundlinks_protocol
-attr_outboundlinks_urlstub
-attr_outboundlinks_name
-attr_outboundlinks_rel
-attr_outboundlinks_text

-## total number of external links, int
-outboundlinkscount_i
+## external links, only the protocol
+#attr_outboundlinks_protocol
+
+## external links, the url only without the protocol
+#attr_outboundlinks_urlstub
+
+## external links, the name property of the a-tag
+#attr_outboundlinks_name
+
+## external links, the rel property of the a-tag
+#attr_outboundlinks_rel
+
+## external links, the text content of the a-tag
+#attr_outboundlinks_text
+
+## total number of external links, int
+outboundlinks_i

 ## number of external links with noindex tag, int
 outboundlinksnoindexcount_i

+## all image tags, encoded as <img> tags including the alt and title properties, textgen
+attr_images_tag
+
+## all image links without the protocol and '://'
+#attr_images_urlstub
+
+## all image link protocols
+#attr_images_protocol
+
+## the alt property of all image tags
+#attr_images_alt
+
+## number of images, int
+imagescount_i
+
 ## h1 header, textgen
 attr_h1

@ -154,12 +195,6 @@ attr_italiccount
 ## total number of occurrences of <i>, int
 italic_i

-## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
-attr_images
-
-## number of images, int
-imagescount_i
-
 ## flag that shows if a swf file is linked, boolean
 flash_b

@ -205,6 +240,12 @@ attr_tracker
 ## number of attribute counts in attr_tracker, textgen
 attr_trackercount

+## names matching title expressions, textgen
+attr_title
+
+## number of matching title expressions, textgen
+attr_titlecount
+
 ## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
 failreason_t

--- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
@ -128,76 +128,75 @@ public class SolrScheme extends ConfigurationSet {
        int c = 0;
        if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
        if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
-        if (isEmpty() || contains("attr_inboundlinks")) {
-            final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
-            final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
-            final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
-            final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
-            final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
-            final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
-            for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
-                final Properties p = alllinks.get(url);
-                final String name = p.getProperty("name", ""); // the name attribute
-                final String rel = p.getProperty("rel", "");   // the rel-attribute
-                final String text = p.getProperty("text", ""); // the text between the <a></a> tag
-                final String urls = url.toNormalform(false, false);
-                final int pr = urls.indexOf("://");
-                inboundlinksURLProtocol[c] = urls.substring(0, pr);
-                inboundlinksURLStub[c] = urls.substring(pr + 3);
-                inboundlinksName[c] = name.length() > 0 ? name : "";
-                inboundlinksRel[c] = rel.length() > 0 ? rel : "";
-                inboundlinksText[c] = text.length() > 0 ? rel : "";
-                inboundlinksTag[c] =
-                    "<a href=\"" + url.toNormalform(false, false) + "\"" +
-                    (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
-                    ">" +
-                    ((name.length() > 0) ? name : "") + "</a>";
-                c++;
-            }
-            addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
-            addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
-            addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
-            addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
-            addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
-            addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
+        final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
+        final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
+        final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
+        final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
+        final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
+        final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
+        for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
+            final Properties p = alllinks.get(url);
+            final String name = p.getProperty("name", ""); // the name attribute
+            final String rel = p.getProperty("rel", "");   // the rel-attribute
+            final String text = p.getProperty("text", ""); // the text between the <a></a> tag
+            final String urls = url.toNormalform(false, false);
+            final int pr = urls.indexOf("://");
+            inboundlinksURLProtocol[c] = urls.substring(0, pr);
+            inboundlinksURLStub[c] = urls.substring(pr + 3);
+            inboundlinksName[c] = name.length() > 0 ? name : "";
+            inboundlinksRel[c] = rel.length() > 0 ? rel : "";
+            inboundlinksText[c] = text.length() > 0 ? text : "";
+            inboundlinksTag[c] =
+                "<a href=\"" + url.toNormalform(false, false) + "\"" +
+                (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
+                (name.length() > 0 ? " name=\"" + name + "\"" : "") +
+                ">" +
+                ((text.length() > 0) ? text : "") + "</a>";
+            c++;
        }
+        if (isEmpty() || contains("attr_inboundlinks_tag")) addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
+        if (isEmpty() || contains("attr_inboundlinks_protocol")) addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
+        if (isEmpty() || contains("attr_inboundlinks_urlstub")) addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
+        if (isEmpty() || contains("attr_inboundlinks_name")) addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
+        if (isEmpty() || contains("attr_inboundlinks_rel")) addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
+        if (isEmpty() || contains("attr_inboundlinks_text")) addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);

        c = 0;
        if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
        if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
-        if (isEmpty() || contains("attr_outboundlinks")) {
-            final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
-            final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
-            final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
-            final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
-            final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
-            final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
-            for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
-                final Properties p = alllinks.get(url);
-                final String name = p.getProperty("name", ""); // the name attribute
-                final String rel = p.getProperty("rel", "");   // the rel-attribute
-                final String text = p.getProperty("text", ""); // the text between the <a></a> tag
-                final String urls = url.toNormalform(false, false);
-                final int pr = urls.indexOf("://");
-                outboundlinksURLProtocol[c] = urls.substring(0, pr);
-                outboundlinksURLStub[c] = urls.substring(pr + 3);
-                outboundlinksName[c] = name.length() > 0 ? name : "";
-                outboundlinksRel[c] = rel.length() > 0 ? rel : "";
-                outboundlinksText[c] = text.length() > 0 ? rel : "";
-                outboundlinksTag[c] =
-                    "<a href=\"" + url.toNormalform(false, false) + "\"" +
-                    (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
-                    ">" +
-                    ((name.length() > 0) ? name : "") + "</a>";
-                c++;
-            }
-            addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
-            addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
-            addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
-            addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
-            addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
-            addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
+        final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
+        final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
+        final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
+        final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
+        final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
+        final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
+        for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
+            final Properties p = alllinks.get(url);
+            final String name = p.getProperty("name", ""); // the name attribute
+            final String rel = p.getProperty("rel", "");   // the rel-attribute
+            final String text = p.getProperty("text", ""); // the text between the <a></a> tag
+            final String urls = url.toNormalform(false, false);
+            final int pr = urls.indexOf("://");
+            outboundlinksURLProtocol[c] = urls.substring(0, pr);
+            outboundlinksURLStub[c] = urls.substring(pr + 3);
+            outboundlinksName[c] = name.length() > 0 ? name : "";
+            outboundlinksRel[c] = rel.length() > 0 ? rel : "";
+            outboundlinksText[c] = text.length() > 0 ? text : "";
+            outboundlinksTag[c] =
+                "<a href=\"" + url.toNormalform(false, false) + "\"" +
+                (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
+                (name.length() > 0 ? " name=\"" + name + "\"" : "") +
+                ">" +
+                ((text.length() > 0) ? text : "") + "</a>";
+            c++;
        }
+        if (isEmpty() || contains("attr_outboundlinks_tag")) addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
+        if (isEmpty() || contains("attr_outboundlinks_protocol")) addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
+        if (isEmpty() || contains("attr_outboundlinks_urlstub")) addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
+        if (isEmpty() || contains("attr_outboundlinks_name")) addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
+        if (isEmpty() || contains("attr_outboundlinks_rel")) addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
+        if (isEmpty() || contains("attr_outboundlinks_text")) addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
+

        // charset
        addSolr(solrdoc, "charset_s", yacydoc.getCharset());
@ -255,27 +254,43 @@ public class SolrScheme extends ConfigurationSet {
            if (li.length > 0) addSolr(solrdoc, "attr_li", li);

            // images
-            if (isEmpty() || contains("attr_images")) {
-                final Collection<ImageEntry> imagesc = html.getImages().values();
-                final String[] images = new String[imagesc.size()];
-                c = 0;
-                for (final ImageEntry ie: imagesc) images[c++] = ie.toString();
-                addSolr(solrdoc, "imagescount_i", images.length);
-                if (images.length > 0) addSolr(solrdoc, "attr_images", images);
+            final Collection<ImageEntry> imagesc = html.getImages().values();
+            final String[] imgtags  = new String[imagesc.size()];
+            final String[] imgprots = new String[imagesc.size()];
+            final String[] imgstubs = new String[imagesc.size()];
+            final String[] imgalts  = new String[imagesc.size()];
+            c = 0;
+            for (final ImageEntry ie: imagesc) {
+                final MultiProtocolURI uri = ie.url();
+                imgtags[c] = ie.toString();
+                imgprots[c] = uri.getProtocol();
+                imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
+                imgalts[c] = ie.alt();
+                c++;
            }
+            addSolr(solrdoc, "imagescount_i", imgtags.length);
+            if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags);
+            if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots);
+            if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs);
+            if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts);

            // style sheets
            if (isEmpty() || contains("attr_css")) {
                final Map<MultiProtocolURI, String> csss = html.getCSS();
-                final String[] css = new String[csss.size()];
+                final String[] css_tag = new String[csss.size()];
+                final String[] css_url = new String[csss.size()];
                c = 0;
                for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
-                    css[c++] =
+                    final String url = entry.getKey().toNormalform(false, false, false, false);
+                    css_tag[c] =
                        "<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
-                        " href=\""+ entry.getKey().toNormalform(false, false, false, false) + "\" />";
+                        " href=\""+ url + "\" />";
+                    css_url[c] = url;
+                    c++;
                }
-                addSolr(solrdoc, "csscount_i", css.length);
-                if (css.length > 0) addSolr(solrdoc, "attr_css", css);
+                addSolr(solrdoc, "csscount_i", css_tag.length);
+                if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag);
+                if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url);
            }

            // Scripts
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -447,6 +447,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            if (h.length() > 0) this.headlines[5].add(h);
        } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
            this.title = recursiveParse(text);
+            this.evaluationScores.match(Element.title, this.title);
        } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.bold.inc(h);
--- a/source/net/yacy/document/parser/html/Evaluation.java
+++ b/source/net/yacy/document/parser/html/Evaluation.java
@ -62,6 +62,7 @@ public class Evaluation {

    public static enum Element {
        text,
+        title,
        bodyclass,
        divid,
        csspath,
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@ -11,12 +11,12 @@
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
- *  
+ *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
- *  
+ *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
@ -34,8 +34,6 @@ import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.regex.Pattern;

-import com.ibm.icu.text.CharsetDetector;
-
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.document.AbstractParser;
@ -47,47 +45,49 @@ import net.yacy.document.parser.html.ScraperInputStream;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.util.FileUtils;

+import com.ibm.icu.text.CharsetDetector;
+

 public class htmlParser extends AbstractParser implements Parser {

    private static final Pattern patternUnderline = Pattern.compile("_");

    public htmlParser() {
-        super("HTML Parser"); 
-        SUPPORTED_EXTENSIONS.add("htm");
-        SUPPORTED_EXTENSIONS.add("html");
-        SUPPORTED_EXTENSIONS.add("phtml");
-        SUPPORTED_EXTENSIONS.add("shtml");
-        SUPPORTED_EXTENSIONS.add("xhtml");
-        SUPPORTED_EXTENSIONS.add("php");
-        SUPPORTED_EXTENSIONS.add("php3");
-        SUPPORTED_EXTENSIONS.add("php4");
-        SUPPORTED_EXTENSIONS.add("php5");
-        SUPPORTED_EXTENSIONS.add("cfm");
-        SUPPORTED_EXTENSIONS.add("asp");
-        SUPPORTED_EXTENSIONS.add("aspx");
-        SUPPORTED_EXTENSIONS.add("tex");
-        SUPPORTED_EXTENSIONS.add("txt");
+        super("HTML Parser");
+        this.SUPPORTED_EXTENSIONS.add("htm");
+        this.SUPPORTED_EXTENSIONS.add("html");
+        this.SUPPORTED_EXTENSIONS.add("phtml");
+        this.SUPPORTED_EXTENSIONS.add("shtml");
+        this.SUPPORTED_EXTENSIONS.add("xhtml");
+        this.SUPPORTED_EXTENSIONS.add("php");
+        this.SUPPORTED_EXTENSIONS.add("php3");
+        this.SUPPORTED_EXTENSIONS.add("php4");
+        this.SUPPORTED_EXTENSIONS.add("php5");
+        this.SUPPORTED_EXTENSIONS.add("cfm");
+        this.SUPPORTED_EXTENSIONS.add("asp");
+        this.SUPPORTED_EXTENSIONS.add("aspx");
+        this.SUPPORTED_EXTENSIONS.add("tex");
+        this.SUPPORTED_EXTENSIONS.add("txt");
        //SUPPORTED_EXTENSIONS.add("js");
-        SUPPORTED_EXTENSIONS.add("jsp");
-        SUPPORTED_EXTENSIONS.add("mf");
-        SUPPORTED_EXTENSIONS.add("pl");
-        SUPPORTED_EXTENSIONS.add("py");
-        SUPPORTED_MIME_TYPES.add("text/html");
-        SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
-        SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
-        SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
-        SUPPORTED_MIME_TYPES.add("application/x-tex");
-        SUPPORTED_MIME_TYPES.add("text/plain");
-        SUPPORTED_MIME_TYPES.add("text/sgml");
-        SUPPORTED_MIME_TYPES.add("text/csv");
+        this.SUPPORTED_EXTENSIONS.add("jsp");
+        this.SUPPORTED_EXTENSIONS.add("mf");
+        this.SUPPORTED_EXTENSIONS.add("pl");
+        this.SUPPORTED_EXTENSIONS.add("py");
+        this.SUPPORTED_MIME_TYPES.add("text/html");
+        this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
+        this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
+        this.SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
+        this.SUPPORTED_MIME_TYPES.add("application/x-tex");
+        this.SUPPORTED_MIME_TYPES.add("text/plain");
+        this.SUPPORTED_MIME_TYPES.add("text/sgml");
+        this.SUPPORTED_MIME_TYPES.add("text/csv");
    }
-    
+
    public static ContentScraper parseToScraper(
-            final MultiProtocolURI location, 
-            final String documentCharset, 
+            final MultiProtocolURI location,
+            final String documentCharset,
            InputStream sourceStream) throws Parser.Failure, IOException {
-        
+
        // make a scraper
        String charset = null;

@ -95,72 +95,72 @@ public class htmlParser extends AbstractParser implements Parser {
        if (documentCharset != null) {
            charset = patchCharsetEncoding(documentCharset);
        }
-        
+
        // nothing found: try to find a meta-tag
        if (charset == null) {
            try {
                final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
                sourceStream = htmlFilter;
                charset = htmlFilter.detectCharset();
-            } catch (IOException e1) {
+            } catch (final IOException e1) {
                throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
            }
        }

        // the author didn't tell us the encoding, try the mozilla-heuristic
        if (charset == null) {
-        	CharsetDetector det = new CharsetDetector();
+        	final CharsetDetector det = new CharsetDetector();
        	det.enableInputFilter(true);
-        	InputStream detStream = new BufferedInputStream(sourceStream);
+        	final InputStream detStream = new BufferedInputStream(sourceStream);
        	det.setText(detStream);
        	charset = det.detect().getName();
        	sourceStream = detStream;
        }
-        
+
        // wtf? still nothing, just take system-standard
        if (charset == null) {
            charset = Charset.defaultCharset().name();
        }
-        
+
        Charset c;
        try {
        	c = Charset.forName(charset);
-        } catch (IllegalCharsetNameException e) {
+        } catch (final IllegalCharsetNameException e) {
        	c = Charset.defaultCharset();
-        } catch (UnsupportedCharsetException e) {
+        } catch (final UnsupportedCharsetException e) {
        	c = Charset.defaultCharset();
        }
-        
+
        // parsing the content
-        final ContentScraper scraper = new ContentScraper(location);        
+        final ContentScraper scraper = new ContentScraper(location);
        final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
        try {
            FileUtils.copy(sourceStream, writer, c);
-        } catch (IOException e) {
+        } catch (final IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);
        } finally {
        	sourceStream.close();
            writer.close();
        }
-        //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);            
+        //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
        //serverFileUtils.copy(sourceFile, hfos);
        //hfos.close();
        if (writer.binarySuspect()) {
            final String errorMsg = "Binary data found in resource";
-            throw new Parser.Failure(errorMsg, location);    
+            throw new Parser.Failure(errorMsg, location);
        }
        return scraper;
    }

    public Document[] parse(
-            final MultiProtocolURI location, 
-            final String mimeType, 
-            final String documentCharset, 
+            final MultiProtocolURI location,
+            final String mimeType,
+            final String documentCharset,
            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
-        
+
        try {
 			return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
-		} catch (IOException e) {
+		} catch (final IOException e) {
 			throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
 		}
    }
@ -197,7 +197,7 @@ public class htmlParser extends AbstractParser implements Parser {
                scraper.getRSS(),
                scraper.getImages(),
                scraper.indexingDenied())};
-        //scraper.close();            
+        //scraper.close();
        for (final Document ppd: ppds) {
            ppd.setFavicon(scraper.getFavicon());
        }
@ -214,10 +214,10 @@ public class htmlParser extends AbstractParser implements Parser {
     * @return patched encoding name
     */
    public static String patchCharsetEncoding(String encoding) {
-        
+
        // do nothing with null
        if ((encoding == null) || (encoding.length() < 3)) return null;
-        
+
        // trim encoding string
        encoding = encoding.trim();

@ -228,7 +228,7 @@ public class htmlParser extends AbstractParser implements Parser {
        // all other names but such with "windows" use uppercase
        if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
        if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
-        
+
        // fix wrong fill characters
        encoding = patternUnderline.matcher(encoding).replaceAll("-");

@ -236,7 +236,7 @@ public class htmlParser extends AbstractParser implements Parser {
        if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
        if (encoding.startsWith("US")) return "US-ASCII";
        if (encoding.startsWith("KOI")) return "KOI8-R";
-        
+
        // patch missing '-'
        if (encoding.startsWith("windows") && encoding.length() > 7) {
            final char c = encoding.charAt(7);
@ -244,7 +244,7 @@ public class htmlParser extends AbstractParser implements Parser {
                encoding = "windows-" + encoding.substring(7);
            }
        }
-        
+
        if (encoding.startsWith("ISO")) {
            // patch typos
            if (encoding.length() > 3) {
@ -256,11 +256,11 @@ public class htmlParser extends AbstractParser implements Parser {
            if (encoding.length() > 8) {
                final char c = encoding.charAt(8);
                if ((c >= '0') && (c <= '9')) {
-                    encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);           
-                } 
+                    encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
+                }
            }
        }
-        
+
        // patch wrong name
        if (encoding.startsWith("ISO-8559")) {
            // popular typo
@ -279,26 +279,26 @@ public class htmlParser extends AbstractParser implements Parser {

        return encoding;
    }
- 
-    public static void main(String[] args) {
+
+    public static void main(final String[] args) {
        // test parsing of a url
        MultiProtocolURI url;
        try {
            url = new MultiProtocolURI(args[0]);
-            byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
-            Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
-            String title = document[0].dc_title();
+            final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
+            final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
+            final String title = document[0].dc_title();
            System.out.println(title);
            System.out.println(CharacterCoding.unicode2html(title, false));
-        } catch (MalformedURLException e) {
+        } catch (final MalformedURLException e) {
            e.printStackTrace();
-        } catch (IOException e) {
+        } catch (final IOException e) {
            e.printStackTrace();
-        } catch (Parser.Failure e) {
+        } catch (final Parser.Failure e) {
            e.printStackTrace();
-        } catch (InterruptedException e) {
+        } catch (final InterruptedException e) {
            e.printStackTrace();
        }
    }
-    
+
 }
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@ -147,7 +147,7 @@ public final class LoaderDispatcher {
        FileUtils.copy(b, tmp);
        tmp.renameTo(targetFile);
    }
-    
+
    public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException {
    	return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist);
    }
@ -274,7 +274,7 @@ public final class LoaderDispatcher {
        if (response != null && response.getContent() != null) {
            // we got something. Now check if we want to store that to the cache
            // first check looks if we want to store the content to the cache
-            if (!crawlProfile.storeHTCache()) {
+            if (crawlProfile == null || !crawlProfile.storeHTCache()) {
                // no caching wanted. Thats ok, do not write any message
                return response;
            }
@ -294,7 +294,7 @@ public final class LoaderDispatcher {

        throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
    }
-    
+
    private int protocolMaxFileSize(final DigestURI url) {
    	if (url.isHTTP() || url.isHTTPS())
    		return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
--- a/startYACY.sh
+++ b/startYACY.sh
@ -124,6 +124,8 @@ then
    then 
        ENABLEHUGEPAGES=1
    fi
+    # the G1 GC is available (but not the default) in Java7; uncomment the next line to try it here as well
+    # JAVA_ARGS="$JAVA_ARGS -XX:+UnlockExperimentalVMOptions -XX:+UseG1GC"
 elif [ $OS = "SunOS" ]
 then
 	# the UseConcMarkSweepGC option caused a full CPU usage - bug on Darwin.