added more attributes for html parser and enhanced data structures

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7679 6c8d7289-2bf4-0310-a012-ef5d649a1542
14 years ago · f6077b3cc0
parent 0b02083e97
commit f6077b3cc0
18 changed files with 462 additions and 81 deletions
--- a/bin/clearcache.sh
+++ b/bin/clearcache.sh
@ -0,0 +1,3 @@
+#!/bin/bash
+cd "$(dirname "$0")" || exit 1 # run from the script's own directory; quoting keeps paths with spaces intact
+./apicall.sh "/IndexControlRWIs_p.html?deleteIndex=off&deleteSolr=off&deleteCache=on&deleteCrawlQueues=off&deleteRobots=on&deleteSearchFl=on&deletecomplete=" > /dev/null
--- a/bin/clearindex.sh
+++ b/bin/clearindex.sh
@ -0,0 +1,3 @@
+#!/bin/bash
+cd "$(dirname "$0")" || exit 1 # run from the script's own directory; quoting keeps paths with spaces intact
+./apicall.sh "/IndexControlRWIs_p.html?deleteIndex=on&deleteSolr=on&deleteCache=off&deleteCrawlQueues=on&deleteRobots=on&deleteSearchFl=on&deletecomplete=" > /dev/null
--- a/bin/importurllist.sh
+++ b/bin/importurllist.sh
@ -0,0 +1,3 @@
+#!/bin/bash
+cd "$(dirname "$0")" || exit 1 # run from the script's own directory; quoting keeps paths with spaces intact
+./apicall.sh "/Crawler_p.html?bookmarkFolder=/crawlStart&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&crawlingIfOlderUnit=day&cachePolicy=ifexist&indexText=on&crawlingMode=file&crawlingURL=http://&bookmarkTitle=&mustnotmatch=&crawlingstart=import&mustmatch=.*&crawlingIfOlderNumber=7&repeat_unit=seldays&crawlingDepth=0&crawlingFile=$1" > /dev/null
--- a/build.xml
+++ b/build.xml
@ -405,13 +405,20 @@
    <!-- copy searchtest -->
    <copy todir="${release_main}/bin">
      <fileset dir="bin">
-        <include name="searchtest*"/>
-        <include name="localsearch.sh"/>
        <include name="apicall.sh"/>
-        <include name="importmediawiki.sh"/>
        <include name="clearall.sh"/>
-        <include name="up.sh"/>
+        <include name="clearcache.sh"/>
+        <include name="clearindex.sh"/>
        <include name="down.sh"/>
+        <include name="importmediawiki.sh"/>
+        <include name="importOAIList.sh"/>
+        <include name="localsearch.sh"/>
+        <include name="searchtest*"/>
+        <include name="surrogateCleanOut.sh"/>
+        <include name="surrogateMVin.sh"/>
+        <include name="surrogateMVtmp.sh"/>
+        <include name="surrogateRefeed.sh"/>
+        <include name="up.sh"/>
      </fileset>
    </copy>

--- a/htroot/WebStructurePicture_p.java
+++ b/htroot/WebStructurePicture_p.java
@ -112,7 +112,7 @@ public class WebStructurePicture_p {
            
            // recursively find domains, up to a specific depth
            final GraphPlotter graph = new GraphPlotter();
-            if (host != null) place(graph, sb.webStructure, hash, host, nodes, timeout, 0.0, 0.0, 0, depth);
+            if (host != null && hash != null) place(graph, sb.webStructure, hash, host, nodes, timeout, 0.0, 0.0, 0, depth);
            //graph.print();
            
            graphPicture = graph.draw(width, height, 40, 40, 16, 16, color_back, color_dot, color_line, color_lineend, color_text);
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@ -98,6 +98,7 @@ import net.yacy.document.TextParser;
 import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.SurrogateReader;
 import net.yacy.document.importer.OAIListFriendsLoader;
+import net.yacy.document.parser.html.Evaluation;
 import net.yacy.kelondro.blob.Tables;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -485,7 +486,7 @@ public final class Switchboard extends serverSwitch {
        //starting blog
        initBlog();
        
-        // Init User DB
+        // init User DB
        this.log.logConfig("Loading User DB");
        final File userDbFile = new File(getDataPath(), "DATA/SETTINGS/user.heap");
        this.userDB = new UserDB(userDbFile);
@ -493,7 +494,19 @@ public final class Switchboard extends serverSwitch {
        ", " + this.userDB.size() + " entries" +
        ", " + ppRamString(userDbFile.length()/1024));
        
-        // Init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark.
+        // init html parser evaluation scheme: register every parser.*.properties file from defaults/ and then from DATA/SETTINGS/
+        File parserPropertiesPath = new File("defaults/");
+        String[] settingsList = parserPropertiesPath.list(); // null if the path is not a directory or cannot be read
+        if (settingsList != null) for (String l: settingsList) {
+            if (l.startsWith("parser.") && l.endsWith(".properties")) Evaluation.add(new File(parserPropertiesPath, l));
+        }
+        parserPropertiesPath = new File(getDataPath(), "DATA/SETTINGS/");
+        settingsList = parserPropertiesPath.list(); // null if the path is not a directory or cannot be read
+        if (settingsList != null) for (String l: settingsList) {
+            if (l.startsWith("parser.") && l.endsWith(".properties")) Evaluation.add(new File(parserPropertiesPath, l));
+        }
+        
+        // init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark.
        // Can be started concurrently
        new Thread(){
            @Override
--- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
@ -61,29 +61,6 @@ public enum SolrScheme {
        InetAddress address = Domains.dnsResolve(digestURI.getHost());
        if (address != null) solrdoc.addField("attr_ip", address.getHostAddress());
        if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost());
-        /*
-    private final String charset;               // the charset of the document
-    private final List<String> keywords;        // most resources provide a keyword field
-    private       StringBuilder title;          // a document title, taken from title or h1 tag; shall appear as headline of search result
-    private final StringBuilder creator;        // author or copyright
-    private final String publisher;             // publisher
-    private final List<String>  sections;       // if present: more titles/headlines appearing in the document
-    private final StringBuilder description;    // an abstract, if present: short content description
-    private Object text;                        // the clear text, all that is visible
-    private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
-    private final Map<MultiProtocolURI, String> rss; // all embedded rss feeds
-    private final Map<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
-    // the anchors and images - Maps are URL-to-EntityDescription mappings.
-    // The EntityDescription appear either as visible text in anchors or as alternative
-    // text in image tags.
-    private Map<MultiProtocolURI, String> hyperlinks, audiolinks, videolinks, applinks;
-    private Map<String, String> emaillinks;
-    private MultiProtocolURI favicon;
-    private boolean resorted;
-    private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
-    private Set<String> languages;
-    private boolean indexingDenied;
-         */
        solrdoc.addField("title", yacydoc.dc_title());
        solrdoc.addField("author", yacydoc.dc_creator());
        solrdoc.addField("description", yacydoc.dc_description());
@ -166,9 +143,17 @@ public enum SolrScheme {
            
            // bold, italic
            String[] bold = html.getBold();
-            if (bold.length > 0) solrdoc.addField("attr_bold", bold);
+            solrdoc.addField("boldcount_i", bold.length);
+            if (bold.length > 0) {
+                solrdoc.addField("attr_bold", bold);
+                solrdoc.addField("attr_boldcount", html.getBoldCount(bold));
+            }
            String[] italic = html.getItalic();
-            if (bold.length > 0) solrdoc.addField("attr_italic", italic);
+            solrdoc.addField("italiccount_i", italic.length);
+            if (italic.length > 0) {
+                solrdoc.addField("attr_italic", italic);
+                solrdoc.addField("attr_italiccount", html.getItalicCount(italic));
+            }
            String[] li = html.getLi();
            solrdoc.addField("licount_i", li.length);
            if (li.length > 0) solrdoc.addField("attr_li", li);
@ -225,6 +210,15 @@ public enum SolrScheme {
            
            // flash embedded
            solrdoc.addField("flash_b", html.containsFlash());
+            
+            // generic evaluation pattern
+            for (String model: html.getEvaluationModelNames()) {
+                String[] scorenames = html.getEvaluationModelScoreNames(model);
+                if (scorenames.length > 0) {
+                    solrdoc.addField("attr_" + model, scorenames);
+                    solrdoc.addField("attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
+                }
+            }
        }
        return solrdoc;
    }
--- a/source/net/yacy/cora/storage/AbstractScoreMap.java
+++ b/source/net/yacy/cora/storage/AbstractScoreMap.java
@ -0,0 +1,39 @@
+/**
+ *  AbstractScoreMap
+ *  Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
+ *  First released 28.04.2011 at http://yacy.net
+ *
+ *  $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
+ *  $LastChangedRevision: 7653 $
+ *  $LastChangedBy: orbiter $
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *  
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.cora.storage;
+
+public abstract class AbstractScoreMap<E> implements ScoreMap<E> {
+
+    /**
+     * apply all E/int mappings from an external ScoreMap to this ScoreMap
+     */
+    public void inc(ScoreMap<E> map) {
+        if (map == null) return;
+        for (E entry: map) {
+            this.inc(entry, map.get(entry));
+        }
+    }
+    
+}
--- a/source/net/yacy/cora/storage/ClusteredScoreMap.java
+++ b/source/net/yacy/cora/storage/ClusteredScoreMap.java
@ -35,7 +35,7 @@ import java.util.TreeMap;

 import net.yacy.cora.document.UTF8;

-public final class ClusteredScoreMap<E> implements ReversibleScoreMap<E> {
+public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements ReversibleScoreMap<E> {
    
    protected final Map<E, Long> map; // a mapping from a reference to the cluster key
    protected final TreeMap<Long, E> pam; // a mapping from the cluster key to the reference
@ -48,6 +48,10 @@ public final class ClusteredScoreMap<E> implements ReversibleScoreMap<E> {
        gcount = 0;
        encnt = 0;
    }
+
+    public Iterator<E> iterator() {
+        return map.keySet().iterator();
+    }
    
    public synchronized void clear() {
        map.clear();
--- a/source/net/yacy/cora/storage/ConcurrentScoreMap.java
+++ b/source/net/yacy/cora/storage/ConcurrentScoreMap.java
@ -35,7 +35,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicLong;


-public class ConcurrentScoreMap<E> implements ScoreMap<E> {
+public class ConcurrentScoreMap<E> extends AbstractScoreMap<E> implements ScoreMap<E> {

    protected final ConcurrentHashMap<E, AtomicLong> map; // a mapping from a reference to the cluster key
    private long gcount;
@ -44,6 +44,10 @@ public class ConcurrentScoreMap<E> implements ScoreMap<E> {
        map = new ConcurrentHashMap<E, AtomicLong>();
        gcount = 0;
    }
+
+    public Iterator<E> iterator() {
+        return map.keySet().iterator();
+    }
    
    public synchronized void clear() {
        map.clear();
--- a/source/net/yacy/cora/storage/OrderedScoreMap.java
+++ b/source/net/yacy/cora/storage/OrderedScoreMap.java
@ -38,7 +38,7 @@ import java.util.TreeSet;
 import java.util.concurrent.atomic.AtomicInteger;


-public class OrderedScoreMap<E> implements ScoreMap<E> {
+public class OrderedScoreMap<E> extends AbstractScoreMap<E> implements ScoreMap<E> {
    
    protected final Map<E, AtomicInteger> map; // a mapping from a reference to the cluster key
    
@ -49,6 +49,10 @@ public class OrderedScoreMap<E> implements ScoreMap<E> {
            map = new TreeMap<E, AtomicInteger>(comparator);
        }
    }
+
+    public Iterator<E> iterator() {
+        return map.keySet().iterator();
+    }
    
    public synchronized void clear() {
        map.clear();
--- a/source/net/yacy/cora/storage/ScoreMap.java
+++ b/source/net/yacy/cora/storage/ScoreMap.java
@ -26,7 +26,7 @@ package net.yacy.cora.storage;

 import java.util.Iterator;

-public interface ScoreMap<E> {
+public interface ScoreMap<E> extends Iterable<E> {

    public void clear();
    
@ -65,4 +65,5 @@ public interface ScoreMap<E> {
    public void dec(final E obj);
    public void dec(final E obj, final int incrementScore);

+    public void inc(ScoreMap<E> map);
 }
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -402,7 +402,10 @@ dc_rights
            for (Map.Entry<MultiProtocolURI, Properties> entry: anchors.entrySet()) {
                url = entry.getKey();
                if (url == null) continue;
-                if (url.getHost() != null && thishost != null && url.getHost().equals(thishost)) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor");
+                if (url.getHost() != null && thishost != null && // both null guards must cover BOTH host comparisons below
+                    (url.getHost().endsWith(thishost) ||
+                    (thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4))))
+                    ) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor");
                u = url.toNormalform(true, false);
                String name = entry.getValue().getProperty("name", "");
                if (u.startsWith("mailto:")) {
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -44,8 +44,10 @@ import java.util.regex.Pattern;
 import javax.swing.event.EventListenerList;

 import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.storage.ClusteredScoreMap;
 import net.yacy.document.SentenceReader;
 import net.yacy.document.parser.htmlParser;
+import net.yacy.document.parser.html.Evaluation.Element;
 import net.yacy.kelondro.io.CharBuffer;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
@ -60,33 +62,51 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    // statics: for initialization of the HTMLFilterAbstractScraper
    private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
    private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
-
+    
+    public enum TagType {
+        singleton, pair;
+    }
+    
+    public enum Tag {
+        html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
+        body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
+        div(TagType.singleton),  // scraped as singleton to get attached properties like 'id'
+        img(TagType.singleton),
+        base(TagType.singleton),
+        frame(TagType.singleton),
+        meta(TagType.singleton),
+        area(TagType.singleton),
+        link(TagType.singleton),
+        embed(TagType.singleton), //added by [MN]
+        param(TagType.singleton), //added by [MN]
+
+        a(TagType.pair),
+        h1(TagType.pair),
+        h2(TagType.pair),
+        h3(TagType.pair),
+        h4(TagType.pair),
+        h5(TagType.pair),
+        h6(TagType.pair),
+        title(TagType.pair),
+        b(TagType.pair),
+        strong(TagType.pair),
+        i(TagType.pair),
+        li(TagType.pair),
+        iframe(TagType.pair),
+        script(TagType.pair);
+
+        public TagType type;
+        private Tag(TagType type) {
+            this.type = type;
+        }
+    }
+    
    // all these tags must be given in lowercase, because the tags from the files are compared in lowercase
    static {
-        linkTags0.add("html");      // scraped as tag 0 to get attached properties like 'lang'
-        linkTags0.add("img");
-        linkTags0.add("base");
-        linkTags0.add("frame");
-        linkTags0.add("meta");
-        linkTags0.add("area");
-        linkTags0.add("link");
-        linkTags0.add("script");
-        linkTags0.add("embed");     //added by [MN]
-        linkTags0.add("param");     //added by [MN]
-
-        linkTags1.add("a");
-        linkTags1.add("h1");
-        linkTags1.add("h2");
-        linkTags1.add("h3");
-        linkTags1.add("h4");
-        linkTags1.add("h5");
-        linkTags1.add("h6");
-        linkTags1.add("title");
-        linkTags1.add("b");
-        linkTags1.add("strong");
-        linkTags1.add("i");
-        linkTags1.add("li");
-        linkTags1.add("iframe");
+        for (Tag tag: Tag.values()) {
+            if (tag.type == TagType.singleton) linkTags0.add(tag.name());
+            if (tag.type == TagType.pair) linkTags1.add(tag.name());
+        }
        //<iframe src="../../../index.htm" name="SELFHTML_in_a_box" width="90%" height="400">
    }

@ -99,7 +119,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    private String title;
    //private String headline;
    private List<String>[] headlines;
-    private List<String> bold, italic, li;
+    private ClusteredScoreMap<String> bold, italic;
+    private List<String> li;
    private CharBuffer content;
    private final EventListenerList htmlFilterEventListeners;
    private float lon, lat;
@ -113,6 +134,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     * The document root {@link MultiProtocolURI} 
     */
    private MultiProtocolURI root;
+    
+    /**
+     * evaluation scores: count appearance of specific attributes
+     */
+    private Evaluation.Scores evaluationScores;

    @SuppressWarnings("unchecked")
    public ContentScraper(final MultiProtocolURI root) {
@ -120,6 +146,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        // it is only the reference for relative links
        super(linkTags0, linkTags1);
        this.root = root;
+        this.evaluationScores = new Evaluation.Scores();
        this.rss = new HashMap<MultiProtocolURI, String>();
        this.css = new HashMap<MultiProtocolURI, String>();
        this.anchors = new HashMap<MultiProtocolURI, Properties>();
@ -131,19 +158,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.title = "";
        this.headlines = new ArrayList[6];
        for (int i = 0; i < this.headlines.length; i++) headlines[i] = new ArrayList<String>();
-        this.bold = new ArrayList<String>();
-        this.italic = new ArrayList<String>();
+        this.bold = new ClusteredScoreMap<String>();
+        this.italic = new ClusteredScoreMap<String>();
        this.li = new ArrayList<String>();
        this.content = new CharBuffer(1024);
        this.htmlFilterEventListeners = new EventListenerList();
        this.lon = 0.0f;
        this.lat = 0.0f;
+        Evaluation.match(Element.url, root.toNormalform(false, false), this.evaluationScores);
    }
    
    public void scrapeText(final char[] newtext, final String insideTag) {
        // System.out.println("SCRAPE: " + UTF8.String(newtext));
        int p, pl, q, s = 0;

+        // match evaluation pattern
+        Evaluation.match(Element.text, newtext, this.evaluationScores);
+        
        // try to find location information in text
        // Opencaching:
        // <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
@ -246,11 +277,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            try {
                final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
                final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
-                //if (width > 15 && height > 15) {
-                    final MultiProtocolURI url = absolutePath(tagopts.getProperty("src", ""));
-                    final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
-                    addImage(images, ie);
-                //}
+                String src = tagopts.getProperty("src", "");
+                if (src.length() > 0) {
+                    final MultiProtocolURI url = absolutePath(src);
+                    if (url != null) {
+                        final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
+                        addImage(images, ie);
+                    }
+                }
            } catch (final NumberFormatException e) {}
        } else if(tagname.equalsIgnoreCase("base")) {
            try {
@ -262,16 +296,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        } else if (tagname.equalsIgnoreCase("iframe")) {
            anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
            iframes.add(absolutePath(tagopts.getProperty("src", "")));
-        } else if (tagname.equalsIgnoreCase("script")) {
-            script.add(absolutePath(tagopts.getProperty("src", "")));
+        } else if (tagname.equalsIgnoreCase("body")) {
+            String c = tagopts.getProperty("class", "");
+            Evaluation.match(Element.bodyclass, c, this.evaluationScores);
+        } else if (tagname.equalsIgnoreCase("div")) {
+            String id = tagopts.getProperty("id", "");
+            Evaluation.match(Element.divid, id, this.evaluationScores);
        } else if (tagname.equalsIgnoreCase("meta")) {
            String name = tagopts.getProperty("name", "");
+            String content = tagopts.getProperty("content","");
            if (name.length() > 0) {
-                metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content","")));
+                metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
+                if (name.equalsIgnoreCase("generator")) { // compare case-insensitively, consistent with the lowercased key used for metas
+                    Evaluation.match(Element.metagenerator, content, this.evaluationScores);
+                }
            } else {
                name = tagopts.getProperty("http-equiv", "");
                if (name.length() > 0) {
-                    metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content","")));
+                    metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
                }
            }
        } else if (tagname.equalsIgnoreCase("area")) {
@ -281,7 +323,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            Properties p = new Properties(); p.put("name", areatitle);
            if (href.length() > 0) anchors.put(absolutePath(href), p);
        } else if (tagname.equalsIgnoreCase("link")) {
-            final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", ""));
+            String href = tagopts.getProperty("href", "");
+            final MultiProtocolURI newLink = absolutePath(href);

            if (newLink != null) {
                final String rel = tagopts.getProperty("rel", "");
@ -296,6 +339,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                    rss.put(newLink, linktitle);
                } else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
                    css.put(newLink, rel);
+                    Evaluation.match(Element.csspath, href, this.evaluationScores);
                } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
                    Properties p = new Properties(); p.put("name", linktitle);
                    anchors.put(newLink, p);
@ -356,21 +400,34 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            title = recursiveParse(text);
        } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
            h = recursiveParse(text);
-            if (h.length() > 0) bold.add(h);
+            if (h.length() > 0) bold.inc(h);
        } else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
            h = recursiveParse(text);
-            if (h.length() > 0) bold.add(h);
+            if (h.length() > 0) bold.inc(h);
        } else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
            h = recursiveParse(text);
-            if (h.length() > 0) italic.add(h);
+            if (h.length() > 0) italic.inc(h);
        } else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) li.add(h);
+        } else if (tagname.equalsIgnoreCase("script")) {
+            String src = tagopts.getProperty("src", "");
+            if (src.length() > 0) {
+                script.add(absolutePath(src));
+                Evaluation.match(Element.scriptpath, src, this.evaluationScores);
+            } else {
+                Evaluation.match(Element.scriptcode, text, this.evaluationScores);
+            }
        }

        // fire event
        fireScrapeTag1(tagname, tagopts, text);
    }
+    
+
+    public void scrapeComment(final char[] comment) {
+        Evaluation.match(Element.comment, comment, this.evaluationScores);
+    }

    private String recursiveParse(final char[] inlineHtml) {
        if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml));
@ -446,11 +503,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    }
    
    public String[] getBold() {
-        return this.bold.toArray(new String[this.bold.size()]);
+        List<String> a = new ArrayList<String>();
+        Iterator<String> i = this.bold.keys(false);
+        while (i.hasNext()) a.add(i.next());        
+        return a.toArray(new String[a.size()]);
+    }
+    
+    public String[] getBoldCount(String[] a) {
+        String[] counter = new String[a.length];
+        for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(this.bold.get(a[i]));
+        return counter;
    }
    
    public String[] getItalic() {
-        return this.italic.toArray(new String[this.italic.size()]);
+        List<String> a = new ArrayList<String>();
+        Iterator<String> i = this.italic.keys(false);
+        while (i.hasNext()) a.add(i.next());        
+        return a.toArray(new String[a.size()]);
+    }
+    
+    public String[] getItalicCount(String[] a) {
+        String[] counter = new String[a.length];
+        for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(this.italic.get(a[i]));
+        return counter;
    }
    
    public String[] getLi() {
@ -663,6 +738,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        return this.lat;
    }
    
+    /**
+     * produce all model names
+     * @return a set of model names
+     */
+    public Set<String> getEvaluationModelNames() {
+        return this.evaluationScores.getModelNames();
+    }
+    
+    public String[] getEvaluationModelScoreNames(String modelName) {
+        List<String> a = new ArrayList<String>();
+        ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
+        if (scores != null) {
+            Iterator<String> i = scores.keys(false);
+            while (i.hasNext()) a.add(i.next());
+        }
+        return a.toArray(new String[a.size()]);
+    }
+    
+    public String[] getEvaluationModelScoreCounts(String modelName, String[] a) {
+        ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
+        String[] counter = new String[a.length];
+        if (scores != null) {
+            for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(scores.get(a[i]));
+        }
+        return counter;
+    }
+    
    /*
     *  (non-Javadoc)
     * @see de.anomic.htmlFilter.htmlFilterScraper#close()
--- a/source/net/yacy/document/parser/html/Evaluation.java
+++ b/source/net/yacy/document/parser/html/Evaluation.java
@ -0,0 +1,196 @@
+package net.yacy.document.parser.html;
+
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import net.yacy.cora.storage.ClusteredScoreMap;
+import net.yacy.kelondro.logging.Log;
+
+
+/*
+ * This class provides methods to apply a pattern analysis to html files.
+ * The pattern analysis is generic and is configured with a properties file
+ * that maps field names to patterns.
+ * The keys of such a configuration file have the structure
+ * <subject-name>_<document-element>
+ * and the values are Java regular expressions.
+ * A html file is scanned for pattern matches within a specific <document-element>
+ * and if such a match is found then the <subject-name> is collected as
+ * subject for the scanned document.
+ * Pattern properties files must have special file names: the file name
+ * must start with "parser." and must end with ".properties";
+ * everything in between is the name of a solr multi-value field in which
+ * the collected subject names are stored.
+ */
+public class Evaluation {
+
+    private static final String FILE_PREFIX = "parser."; // every model file name must start with this
+    private static final String FILE_SUFFIX = ".properties"; // every model file name must end with this
+
+    private static final List<Model> models = new ArrayList<Model>(); // the list of all models that shall be applied
+
+    /**
+     * the document elements whose content can be matched against the patterns of a model;
+     * the names are used as the <document-element> part of the keys in the model files
+     */
+    public static enum Element {
+        text,
+        bodyclass,
+        divid,
+        csspath,
+        metagenerator,
+        url,
+        scriptpath,
+        scriptcode,
+        comment;
+    }
+
+    /**
+     * a subject name together with the pattern that must match to count that subject
+     */
+    private static class Attribute {
+        public final String subject; // the name of the attribute
+        public final Pattern pattern; // the pattern that must match for that attribute
+        public Attribute(String subject, Pattern pattern) {
+            this.subject = subject;
+            this.pattern = pattern;
+        }
+    }
+
+    /**
+     * a model is the content of one pattern properties file:
+     * a name and for each document element a list of subject/pattern pairs
+     */
+    private static class Model {
+
+        private final String modelName;
+        private final Map<Element, List<Attribute>> elementMatcher; // a mapping from elements to lists of Attributes
+
+        /**
+         * load a model from a pattern properties file.
+         * Entries with a missing separator, a bad pattern or an unknown element name
+         * are logged and skipped, they do not prevent the loading of the other entries.
+         * @param patternProperties a file with a name of the form parser.<model-name>.properties
+         * @throws IOException if the file does not exist, has a wrong name or cannot be read
+         */
+        public Model(File patternProperties) throws IOException {
+            if (!patternProperties.exists()) throw new IOException("File does not exist: " + patternProperties);
+            String name = patternProperties.getName();
+            if (!name.startsWith(FILE_PREFIX)) throw new IOException("file name must start with 'parser.': " + name);
+            if (!name.endsWith(FILE_SUFFIX)) throw new IOException("file name must end with '.properties': " + name);
+            // prefix and suffix overlap in the name "parser.properties"; the length must be checked
+            // before the model name is cut out, otherwise substring fails with an unchecked exception
+            if (name.length() <= FILE_PREFIX.length() + FILE_SUFFIX.length()) throw new IOException("file name too short: " + name);
+            this.modelName = name.substring(FILE_PREFIX.length(), name.length() - FILE_SUFFIX.length());
+
+            // load the file; Properties.load does not close the reader, so this must be done here
+            Properties p = new Properties();
+            FileReader reader = new FileReader(patternProperties);
+            try {
+                p.load(reader);
+            } finally {
+                reader.close();
+            }
+
+            // iterate through the properties and generate method patterns
+            this.elementMatcher = new HashMap<Element, List<Attribute>>();
+            String subject, elementName;
+            Element element;
+            Pattern pattern;
+            for (Map.Entry<Object, Object> entry: p.entrySet()) {
+                String k = (String) entry.getKey();
+                String v = (String) entry.getValue();
+                int w = k.indexOf('_');
+                if (w < 0) {
+                    Log.logSevere("PatternAnalysis", "wrong configuration in " + name + ": separator '_' missing: " + k);
+                    continue;
+                }
+                subject = k.substring(0, w);
+                elementName = k.substring(w + 1);
+                try {
+                    pattern = Pattern.compile(v);
+                } catch (PatternSyntaxException e) {
+                    Log.logSevere("PatternAnalysis", "bad pattern in " + name + ": '" + k + "=" + v + "' - " + e.getDescription());
+                    continue;
+                }
+                try {
+                    // valueOf never returns null, it throws an exception if the name is unknown
+                    element = Element.valueOf(elementName);
+                } catch (IllegalArgumentException e) {
+                    Log.logSevere("PatternAnalysis", "unknown element in " + name + ": " + elementName);
+                    continue;
+                }
+                List<Attribute> attributeList = this.elementMatcher.get(element);
+                if (attributeList == null) {
+                    attributeList = new ArrayList<Attribute>();
+                    this.elementMatcher.put(element, attributeList);
+                }
+                attributeList.add(new Attribute(subject, pattern));
+            }
+        }
+
+        /**
+         * @return the name of the model, which is the part of the file name between "parser." and ".properties"
+         */
+        public String getName() {
+            return this.modelName;
+        }
+
+        /**
+         * match the content of an element against all patterns that are configured for that element
+         * @param element - the name of the element as Element enum type
+         * @param content - the content of the element
+         * @return a score map where inc was called once for the subject of every pattern that
+         *         matches the complete content; the map is empty if nothing is configured for the element
+         */
+        public ClusteredScoreMap<String> match(Element element, String content) {
+            ClusteredScoreMap<String> subjects = new ClusteredScoreMap<String>();
+            List<Attribute> patterns = this.elementMatcher.get(element);
+            if (patterns == null) return subjects;
+            for (Attribute attribute: patterns) {
+                // matches() requires that the pattern covers the whole content, not only a part of it
+                if (attribute.pattern.matcher(content).matches()) subjects.inc(attribute.subject);
+            }
+            return subjects;
+        }
+
+    }
+
+    /**
+     * the collected scores of one document: for each model a score map of subject names.
+     * A Scores object is filled by Evaluation.match
+     */
+    public static class Scores {
+
+        private final Map<String, ClusteredScoreMap<String>> modelMap; // a map from model names to attribute scores
+
+        public Scores() {
+            this.modelMap = new HashMap<String, ClusteredScoreMap<String>>();
+        }
+
+        /**
+         * produce all model names
+         * @return a set of model names; this is the key set of the internal map, not a copy
+         */
+        public Set<String> getModelNames() {
+            return this.modelMap.keySet();
+        }
+
+        /**
+         * get the scores of a model
+         * the scores are an attribute/count map which counts how often a specific attribute was found
+         * @param modelName the name of the model
+         * @return the score map of the model or null if nothing is stored for that model name
+         */
+        public ClusteredScoreMap<String> getScores(String modelName) {
+            return this.modelMap.get(modelName);
+        }
+    }
+
+    /**
+     * add a model to the evaluation set
+     * @param f the pattern properties file of the model, named parser.<model-name>.properties
+     * @throws IOException if the file does not exist, has a wrong name or cannot be read
+     */
+    public static void add(File f) throws IOException {
+        Model pattern = new Model(f);
+        models.add(pattern);
+    }
+
+    /**
+     * match some content within a specific element
+     * this will increase statistic counters for models if a model matches
+     * @param element - the element where a matching is made
+     * @param content - the content of the element which shall be matched
+     * @param scores - the score object where the scores are stored
+     */
+    public static void match(Element element, String content, Scores scores) {
+        if (models.isEmpty()) return; // fast return if this feature is not used
+        ClusteredScoreMap<String> newScores, oldScores;
+        for (Model pattern: models) {
+            newScores = pattern.match(element, content);
+            oldScores = scores.getScores(pattern.getName());
+            if (oldScores == null) {
+                // first match attempt for this model: create the score map so that it can be extended
+                oldScores = new ClusteredScoreMap<String>();
+                scores.modelMap.put(pattern.getName(), oldScores);
+            }
+            oldScores.inc(newScores);
+        }
+    }
+
+    /**
+     * match some content within a specific element, see match(Element, String, Scores)
+     * @param element - the element where a matching is made
+     * @param content - the content of the element as character array
+     * @param scores - the score object where the scores are stored
+     */
+    public static void match(Element element, char[] content, Scores scores) {
+        if (models.isEmpty()) return; // fast return if this feature is not used; avoids the String creation
+        match(element, new String(content), scores);
+    }
+
+}
--- a/source/net/yacy/document/parser/html/ImageEntry.java
+++ b/source/net/yacy/document/parser/html/ImageEntry.java
@ -36,6 +36,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
    private final long fileSize;
    
    public ImageEntry(final MultiProtocolURI url, final String alt, final int width, final int height, long fileSize) {
+        assert url != null;
        this.url = url;
        this.alt = alt;
        this.width = width;
--- a/source/net/yacy/document/parser/html/Scraper.java
+++ b/source/net/yacy/document/parser/html/Scraper.java
@ -37,6 +37,8 @@ public interface Scraper {
    public void scrapeTag0(String tagname, Properties tagopts);

    public void scrapeTag1(String tagname, Properties tagopts, char[] text);
+
+    /**
+     * hand over a comment that was found in the document to the scraper
+     * @param comment the characters of the comment as buffered by the caller
+     *        (NOTE: presumably still including the comment delimiters - verify against TransformerWriter)
+     */
+    public void scrapeComment(final char[] comment);
    
    public void close();
    
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@ -395,7 +395,9 @@ public final class TransformerWriter extends Writer {
                    buffer.charAt(buffer.length() - 3) == dash) {
                    // comment is at end
                    inComment = false;
-                    if (out != null) out.write(buffer.getChars());
+                    char[] comment = buffer.getChars();
+                    if (scraper != null) scraper.scrapeComment(comment);
+                    if (out != null) out.write(comment);
                    // buffer = new serverByteBuffer();
                    buffer.reset();
                }