Merge branch 'master' of https://github.com/yacy/yacy_search_server

9 years ago · bfe51001e3
parent 49331dc523 02e4489a23
commit bfe51001e3
32 changed files with 109 additions and 103 deletions
--- a/.classpath
+++ b/.classpath
@ -31,7 +31,7 @@
 	<classpathentry kind="lib" path="lib/jcl-over-slf4j-1.7.12.jar"/>
 	<classpathentry kind="lib" path="lib/slf4j-jdk14-1.7.12.jar"/>
 	<classpathentry kind="lib" path="lib/log4j-over-slf4j-1.7.12.jar"/>
-	<classpathentry kind="lib" path="lib/httpcore-4.4.3.jar"/>
+	<classpathentry kind="lib" path="lib/httpcore-4.4.4.jar"/>
 	<classpathentry kind="lib" path="lib/metadata-extractor-2.8.1.jar"/>
 	<classpathentry kind="lib" path="lib/xmpcore-5.1.2.jar"/>
 	<classpathentry kind="lib" path="lib/jcifs-1.3.17.jar"/>
@ -92,12 +92,11 @@
 	<classpathentry kind="lib" path="lib/jsoup-1.8.3.jar"/>
 	<classpathentry kind="lib" path="lib/javax.servlet-api-3.1.0.jar"/>
 	<classpathentry kind="lib" path="lib/weupnp-0.1.3.jar"/>
-	<classpathentry kind="lib" path="lib/common-image-3.1.2.jar"/>
-	<classpathentry kind="lib" path="lib/common-io-3.1.2.jar"/>
-	<classpathentry kind="lib" path="lib/common-lang-3.1.2.jar"/>
-	<classpathentry kind="lib" path="lib/imageio-core-3.1.2.jar"/>
-	<classpathentry kind="lib" path="lib/imageio-metadata-3.1.2.jar"/>
-	<classpathentry kind="lib" path="lib/imageio-tiff-3.1.2.jar"/>
-	<classpathentry kind="lib" path="lib/servlet-3.1.2.jar"/>
+	<classpathentry kind="lib" path="lib/common-image-3.2.jar"/>
+	<classpathentry kind="lib" path="lib/common-io-3.2.jar"/>
+	<classpathentry kind="lib" path="lib/common-lang-3.2.jar"/>
+	<classpathentry kind="lib" path="lib/imageio-core-3.2.jar"/>
+	<classpathentry kind="lib" path="lib/imageio-metadata-3.2.jar"/>
+	<classpathentry kind="lib" path="lib/imageio-tiff-3.2.jar"/>
 	<classpathentry kind="output" path="gen"/>
 </classpath>
--- a/build.xml
+++ b/build.xml
@ -165,9 +165,9 @@
    	<pathelement location="${lib}/bcmail-jdk15-1.46.jar" />
    	<pathelement location="${lib}/bcprov-jdk15-1.46.jar" />
    	<pathelement location="${lib}/chardet.jar" />
-    	<pathelement location="${lib}/common-image-3.1.2.jar" />
-    	<pathelement location="${lib}/common-io-3.1.2.jar" />
-    	<pathelement location="${lib}/common-lang-3.1.2.jar" />
+    	<pathelement location="${lib}/common-image-3.2.jar" />
+    	<pathelement location="${lib}/common-io-3.2.jar" />
+    	<pathelement location="${lib}/common-lang-3.2.jar" />
    	<pathelement location="${lib}/commons-codec-1.10.jar" />
    	<pathelement location="${lib}/commons-compress-1.10.jar" />
    	<pathelement location="${lib}/commons-fileupload-1.3.1.jar" />
@ -179,12 +179,12 @@
        <pathelement location="${lib}/guava-18.0.jar" />
        <pathelement location="${lib}/htmllexer.jar" />
        <pathelement location="${lib}/httpclient-4.5.1.jar" />
-        <pathelement location="${lib}/httpcore-4.4.3.jar" />
+        <pathelement location="${lib}/httpcore-4.4.4.jar" />
        <pathelement location="${lib}/httpmime-4.5.1.jar" />
        <pathelement location="${lib}/icu4j-56_1.jar" />
-    	<pathelement location="${lib}/imageio-core-3.1.2.jar" />
-    	<pathelement location="${lib}/imageio-metadata-3.1.2.jar" />
-    	<pathelement location="${lib}/imageio-tiff-3.1.2.jar" />
+    	<pathelement location="${lib}/imageio-core-3.2.jar" />
+    	<pathelement location="${lib}/imageio-metadata-3.2.jar" />
+    	<pathelement location="${lib}/imageio-tiff-3.2.jar" />
        <pathelement location="${lib}/J7Zip-modified.jar" />
        <pathelement location="${lib}/jakarta-oro-2.0.8.jar" />
    	<pathelement location="${lib}/jaudiotagger-2.0.4-20111207.115108-15.jar" />
@ -232,7 +232,6 @@
        <pathelement location="${lib}/pdfbox-1.8.10.jar" />
    	<pathelement location="${lib}/poi-3.13-20150929.jar" />
    	<pathelement location="${lib}/poi-scratchpad-3.13-20150929.jar" />
-    	<pathelement location="${lib}/servlet-3.1.2.jar" />
        <pathelement location="${lib}/slf4j-api-1.7.12.jar" />
        <pathelement location="${lib}/slf4j-jdk14-1.7.12.jar" />
        <pathelement location="${lib}/solr-core-5.3.1.jar" />
--- a/htroot/Blacklist_p.html
+++ b/htroot/Blacklist_p.html
@ -28,7 +28,7 @@
    </p>
    <h3>Active list: #(disabled)#
      #{blackLists}#      
-      &nbsp;[&nbsp;<em>#[name]#</em>&nbsp;]&nbsp;
+      &nbsp;[&nbsp;<em><a href="Blacklist_p.html?selectList&selectedListName=#[name]#">#[name]#</a></em>&nbsp;]&nbsp;
      #{/blackLists}#
      ::<span class="error">No blacklist selected</span>#(/disabled)#</h3>
    <!-- blacklist selection -->
--- a/htroot/api/getpageinfo.java
+++ b/htroot/api/getpageinfo.java
@ -114,12 +114,11 @@ public class getpageinfo {
                    prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());

                    // put keywords
-                    final String list[] = scraper.dc_subject();
+                    final Set<String> list = scraper.dc_subject();
                    int count = 0;
                    for (final String element: list) {
-                        final String tag = element;
-                        if (!tag.equals("")) {
-                            prop.putXML("tags_"+count+"_tag", tag);
+                        if (!element.equals("")) {
+                            prop.putXML("tags_"+count+"_tag", element);
                            count++;
                        }
                    }
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@ -114,12 +114,11 @@ public class getpageinfo_p {
                    prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());

                    // put keywords
-                    final String list[] = scraper.dc_subject();
+                    final Set<String> list = scraper.dc_subject();
                    int count = 0;
                    for (final String element: list) {
-                        final String tag = element;
-                        if (!tag.equals("")) {
-                            prop.putXML("tags_"+count+"_tag", tag);
+                        if (!element.equals("")) {
+                            prop.putXML("tags_"+count+"_tag", element);
                            count++;
                        }
                    }
--- a/htroot/yacysearch.json
+++ b/htroot/yacysearch.json
@ -9,7 +9,6 @@
      "title": "Search for #[rss_query]#",
      "link": "#[searchBaseURL]#?query=#[rss_queryenc]#&amp;resource=#[resource]#&amp;contentdom=#[contentdom]#"
    },
-    "totalResults": "#[num-results_totalcount]#",
    "startIndex": "#[num-results_offset]#",
    "itemsPerPage": "#[num-results_itemsPerPage]#",
    "searchTerms": "#[rss_queryenc]#",
--- a/lib/common-image-3.1.2.jar
+++ b/lib/common-image-3.1.2.jar
--- a/lib/common-image-3.2.jar
+++ b/lib/common-image-3.2.jar
--- a/lib/common-io-3.1.2.jar
+++ b/lib/common-io-3.1.2.jar
--- a/lib/common-io-3.2.jar
+++ b/lib/common-io-3.2.jar
--- a/lib/common-lang-3.1.2.jar
+++ b/lib/common-lang-3.1.2.jar
--- a/lib/common-lang-3.2.jar
+++ b/lib/common-lang-3.2.jar
--- a/lib/httpcore-4.4.4.License
+++ b/lib/httpcore-4.4.4.License
--- a/lib/httpcore-4.4.4.jar
+++ b/lib/httpcore-4.4.4.jar
--- a/lib/imageio-core-3.1.2.jar
+++ b/lib/imageio-core-3.1.2.jar
--- a/lib/imageio-core-3.2.jar
+++ b/lib/imageio-core-3.2.jar
--- a/lib/imageio-metadata-3.1.2.jar
+++ b/lib/imageio-metadata-3.1.2.jar
--- a/lib/imageio-metadata-3.2.jar
+++ b/lib/imageio-metadata-3.2.jar
--- a/lib/imageio-tiff-3.1.2.jar
+++ b/lib/imageio-tiff-3.1.2.jar
--- a/lib/imageio-tiff-3.2.jar
+++ b/lib/imageio-tiff-3.2.jar
--- a/lib/servlet-3.1.2.jar
+++ b/lib/servlet-3.1.2.jar
--- a/locales/de.lng
+++ b/locales/de.lng
@ -2166,6 +2166,21 @@ reindex documents containing these fields:==Re-Indiziere Dokumente die folgende
 #"is empty"=="ist leer"
 #"no reindex job running"=="Es läuft kein Re-Indexierungs-Job"
 #"! reindex works only with embedded Solr index !"=="! Re-Indexierung funktioniert nur mit eingebautem Solr Index !"
+Re-Crawl Index Documents==Re-Crawl Index Dokumente
+Searches the local index and selects documents to add to the crawler (recrawl the document).==Durchsucht und selektiert Dokumente im lokalen Index und fügt diese dem Crawler hinzu (Dokumente erneut crawlen).
+This runs transparent as background job.==Dies läuft transparent als Hintergrund Job.
+Documents are added to the crawler only if no other crawls are active==Dokumente werden dem Crawler hinzugefügt, wenn kein anderer Crawl-Job aktiv ist
+and are added in small chunks.==und wird in kleinen Blöcken verarbeitet.
+"start recrawl job now"=="Starte Re-Crawl-Job jetzt"
+"stop recrawl job"=="Beende Re-Crawl-Job"
+to re-crawl documents with fresh_date_dt before today.==um Dokumente mit fresh_date_dt vor Heute erneut zu crawlen.
+after starting the recrawl job you can apply a custom Solr query to select documents to be processed==nach dem Start des Re-Crawl-Jobs kann die Solr Abfrage bearbeitet werden um gewünschte Dokumente zu verarbeiten
+Re-Crawl Query Details==Re-Crawl Abfrage Details
+Documents to process==Dokumente in Warteschlange
+Current Query==Aktuelle Abfrage
+Edit Solr Query==Edit Solr Abfrage
+update==aktualisieren
+include failed urls==inklusive Fehler-Urls
 #-----------------------------

 #File: Load_MediawikiWiki.html
--- a/nbproject/project.xml
+++ b/nbproject/project.xml
@ -83,7 +83,7 @@
            <compilation-unit>
                <package-root>source</package-root>
                <package-root>htroot</package-root>
-                <classpath mode="compile">lib/J7Zip-modified.jar;lib/apache-mime4j-0.6.jar;lib/bcmail-jdk15-1.46.jar;lib/bcprov-jdk15-1.46.jar;lib/chardet.jar;lib/common-image-3.1.2.jar;lib/common-io-3.1.2.jar;lib/common-lang-3.1.2.jar;lib/commons-codec-1.10.jar;lib/commons-compress-1.10.jar;lib/commons-fileupload-1.3.1.jar;lib/commons-io-2.4.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.2.jar;lib/fontbox-1.8.10.jar;lib/guava-18.0.jar;lib/htmllexer.jar;lib/httpclient-4.5.1.jar;lib/httpcore-4.4.3.jar;lib/httpmime-4.5.1.jar;lib/icu4j-56_1.jar;lib/imageio-core-3.1.2.jar;lib/imageio-metadata-3.1.2.jar;lib/imageio-tiff-3.1.2.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/javax.servlet-api-3.1.0.jar;lib/jcifs-1.3.17.jar;lib/jcl-over-slf4j-1.7.12.jar;lib/jempbox-1.8.10.jar;lib/jetty-client-9.2.13.v20150730.jar;lib/jetty-continuation-9.2.13.v20150730.jar;lib/jetty-deploy-9.2.13.v20150730.jar;lib/jetty-http-9.2.13.v20150730.jar;lib/jetty-io-9.2.13.v20150730.jar;lib/jetty-jmx-9.2.13.v20150730.jar;lib/jetty-proxy-9.2.13.v20150730.jar;lib/jetty-security-9.2.13.v20150730.jar;lib/jetty-server-9.2.13.v20150730.jar;lib/jetty-servlet-9.2.13.v20150730.jar;lib/jetty-servlets-9.2.13.v20150730.jar;lib/jetty-util-9.2.13.v20150730.jar;lib/jetty-webapp-9.2.13.v20150730.jar;lib/jetty-xml-9.2.13.v20150730.jar;lib/jsch-0.1.53.jar;lib/json-simple-1.1.1.jar;lib/jsoup-1.8.3.jar;lib/log4j-over-slf4j-1.7.12.jar;lib/lucene-analyzers-common-5.3.1.jar;lib/lucene-analyzers-phonetic-5.3.1.jar;lib/lucene-backward-codecs-5.3.1.jar;lib/lucene-classification-5.3.1.jar;lib/lucene-codecs-5.3.1.jar;lib/lucene-core-5.3.1.jar;lib/lucene-facet-5.3.1.jar;lib/lucene-grouping-5.3.1.jar;lib/lucene-highlighter-5.3.1.jar;lib/lucene-join-5.3.1.jar;lib/lucene-memory-5.3.1.jar;lib/lucene-misc-5.3.1.jar;lib/lucene-queries-5.3.1.jar;lib/lucene-queryparser-5.3.1.jar;lib/lucene-spatial-5.3.1.jar;lib/lucene-suggest-5.3.1.jar;lib/metadata-extractor-2.8.1.jar;lib
/noggit-0.6.jar;lib/org.restlet.jar;lib/pdfbox-1.8.10.jar;lib/poi-3.13-20150929.jar;lib/poi-scratchpad-3.13-20150929.jar;lib/servlet-3.1.2.jar;lib/slf4j-api-1.7.12.jar;lib/slf4j-jdk14-1.7.12.jar;lib/solr-core-5.3.1.jar;lib/solr-solrj-5.3.1.jar;lib/spatial4j-0.4.1.jar;lib/stax2-api-3.1.4.jar;lib/webcat-0.1-swf.jar;lib/weupnp-0.1.3.jar;lib/woodstox-core-asl-4.4.1.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/xmpcore-5.1.2.jar;lib/zookeeper-3.4.6.jar</classpath>
+                <classpath mode="compile">lib/J7Zip-modified.jar;lib/apache-mime4j-0.6.jar;lib/bcmail-jdk15-1.46.jar;lib/bcprov-jdk15-1.46.jar;lib/chardet.jar;lib/common-image-3.2.jar;lib/common-io-3.2.jar;lib/common-lang-3.2.jar;lib/commons-codec-1.10.jar;lib/commons-compress-1.10.jar;lib/commons-fileupload-1.3.1.jar;lib/commons-io-2.4.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.2.jar;lib/fontbox-1.8.10.jar;lib/guava-18.0.jar;lib/htmllexer.jar;lib/httpclient-4.5.1.jar;lib/httpcore-4.4.4.jar;lib/httpmime-4.5.1.jar;lib/icu4j-56_1.jar;lib/imageio-core-3.2.jar;lib/imageio-metadata-3.2.jar;lib/imageio-tiff-3.2.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/javax.servlet-api-3.1.0.jar;lib/jcifs-1.3.17.jar;lib/jcl-over-slf4j-1.7.12.jar;lib/jempbox-1.8.10.jar;lib/jetty-client-9.2.13.v20150730.jar;lib/jetty-continuation-9.2.13.v20150730.jar;lib/jetty-deploy-9.2.13.v20150730.jar;lib/jetty-http-9.2.13.v20150730.jar;lib/jetty-io-9.2.13.v20150730.jar;lib/jetty-jmx-9.2.13.v20150730.jar;lib/jetty-proxy-9.2.13.v20150730.jar;lib/jetty-security-9.2.13.v20150730.jar;lib/jetty-server-9.2.13.v20150730.jar;lib/jetty-servlet-9.2.13.v20150730.jar;lib/jetty-servlets-9.2.13.v20150730.jar;lib/jetty-util-9.2.13.v20150730.jar;lib/jetty-webapp-9.2.13.v20150730.jar;lib/jetty-xml-9.2.13.v20150730.jar;lib/jsch-0.1.53.jar;lib/json-simple-1.1.1.jar;lib/jsoup-1.8.3.jar;lib/log4j-over-slf4j-1.7.12.jar;lib/lucene-analyzers-common-5.3.1.jar;lib/lucene-analyzers-phonetic-5.3.1.jar;lib/lucene-backward-codecs-5.3.1.jar;lib/lucene-classification-5.3.1.jar;lib/lucene-codecs-5.3.1.jar;lib/lucene-core-5.3.1.jar;lib/lucene-facet-5.3.1.jar;lib/lucene-grouping-5.3.1.jar;lib/lucene-highlighter-5.3.1.jar;lib/lucene-join-5.3.1.jar;lib/lucene-memory-5.3.1.jar;lib/lucene-misc-5.3.1.jar;lib/lucene-queries-5.3.1.jar;lib/lucene-queryparser-5.3.1.jar;lib/lucene-spatial-5.3.1.jar;lib/lucene-suggest-5.3.1.jar;lib/metadata-extractor-2.8.1.jar;lib/noggit-0.6.
jar;lib/org.restlet.jar;lib/pdfbox-1.8.10.jar;lib/poi-3.13-20150929.jar;lib/poi-scratchpad-3.13-20150929.jar;lib/slf4j-api-1.7.12.jar;lib/slf4j-jdk14-1.7.12.jar;lib/solr-core-5.3.1.jar;lib/solr-solrj-5.3.1.jar;lib/spatial4j-0.4.1.jar;lib/stax2-api-3.1.4.jar;lib/webcat-0.1-swf.jar;lib/weupnp-0.1.3.jar;lib/woodstox-core-asl-4.4.1.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/xmpcore-5.1.2.jar;lib/zookeeper-3.4.6.jar</classpath>
                <built-to>lib/yacycore.jar</built-to>
                <source-level>1.7</source-level>
            </compilation-unit>
--- a/pom.xml
+++ b/pom.xml
@ -395,11 +395,6 @@
            <artifactId>commons-logging</artifactId>
            <version>1.2</version>
        </dependency>
-		<dependency>
-    		<groupId>com.twelvemonkeys.imageio</groupId>
-    		<artifactId>imageio-tiff</artifactId>
-    		<version>3.1.2</version>
-		</dependency>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>fontbox</artifactId>
@ -435,6 +430,11 @@
            <artifactId>icu4j</artifactId>
            <version>56.1</version>
        </dependency>
+        <dependency>
+            <groupId>com.twelvemonkeys.imageio</groupId>
+            <artifactId>imageio-tiff</artifactId>
+            <version>3.2</version>
+        </dependency>        
        <dependency>
            <groupId>org</groupId>
            <artifactId>jaudiotagger</artifactId>
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -971,8 +971,12 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        return toNormalform(false);
    }

+    /**
+     * Tokenized url as string (without the protocol)
+     * @return example "host com path file ext"
+     */
    public String toTokens() {
-        return toTokens(unescape(this.toNormalform(true)));
+        return toTokens(unescape(this.urlstub(true,true)));
    }

    /**
@ -1105,6 +1109,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        return result;
    }

+    /**
+     * Generates a normal form of the URL without the protocol part.
+     * Apart from the omitted protocol, the result is identical to toNormalform().
+     * @see #toNormalform(boolean)
+     * @param excludeAnchor exclude anchor part
+     * @param removeSessionID exclude session id
+     * @return example "www.host.com:8080/path/file.html"
+     * @see #toNormalform(boolean, boolean)
+     */
    public String urlstub(final boolean excludeAnchor, final boolean removeSessionID) {
        // generates a normal form of the URL
        boolean defaultPort = false;
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -46,7 +46,6 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

@ -71,7 +70,7 @@ public class Document {
    private DigestURL source;             // the source url
    private final String mimeType;              // mimeType as taken from http header
    private final String charset;               // the charset of the document
-    private final List<String> keywords;        // most resources provide a keyword field
+    private final Set<String> keywords;         // most resources provide a keyword field
    private       List<String> titles;          // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
    private final StringBuilder creator;        // author or copyright
    private final String publisher;             // publisher
@ -115,7 +114,7 @@ public class Document {
        this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
        this.charset = charset;
        this.parserObject = parserObject;
-        this.keywords = new LinkedList<String>();
+        this.keywords = new LinkedHashSet<String>();
        if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
        this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
        this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
@ -214,6 +213,10 @@ dc_coverage
 dc_rights
     */

+    /**
+     * Get the main document title. This is the 1st in the list of titles.
+     * @return title_string (may return null or empty string)
+     */
    public String dc_title() {
        return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next();
    }
@ -222,6 +225,10 @@ dc_rights
        return this.titles;
    }

+    /**
+     * Sets the title of the document, replacing any existing titles.
+     * @param title
+     */
    public void setTitle(final String title) {
        this.titles = new ArrayList<String>();
        if (title != null) this.titles.add(title);
@ -239,11 +246,8 @@ dc_rights
     * @param tags
     */
    public void addTags(Set<String> tags) {
-        for (String s: this.keywords) {
-            tags.remove(s);
-        }
        for (String s: tags) {
-            this.keywords.add(s);
+            if (s != null && !s.isEmpty()) this.keywords.add(s);
        }
    }

@ -274,28 +278,27 @@ dc_rights
        }
        return gf;
    }
-    
-    public String[] dc_subject() {
-        // sort out doubles and empty words
-        final TreeSet<String> hs = new TreeSet<String>();
-        String s;
-        for (int i = 0; i < this.keywords.size(); i++) {
-            if (this.keywords.get(i) == null) continue;
-            s = (this.keywords.get(i)).trim();
-            if (!s.isEmpty()) hs.add(s);
-        }
-        final String[] t = new String[hs.size()];
-        int i = 0;
-        for (final String u: hs) t[i++] = u;
-        return t;
+
+    /**
+     * Get the set of keywords associated with the document
+     * @return set of unique keywords
+     */
+    public Set<String> dc_subject() {
+        return this.keywords;
    }

+    /**
+     * Get the keywords associated with the document as a single string,
+     * with the keywords separated by the separator character
+     *
+     * @param separator character
+     * @return string of keywords or empty string
+     */
    public String dc_subject(final char separator) {
-        final String[] t = dc_subject();
-        if (t.length == 0) return "";
+        if (this.keywords.size() == 0) return "";
        // generate a new list
-        final StringBuilder sb = new StringBuilder(t.length * 8);
-        for (final String s: t) sb.append(s).append(separator);
+        final StringBuilder sb = new StringBuilder(this.keywords.size() * 8);
+        for (final String s: this.keywords) sb.append(s).append(separator);
        return sb.substring(0, sb.length() - 1);
    }

@ -427,10 +430,6 @@ dc_rights
        return sentences;
    }

-    public List<String> getKeywords() {
-        return this.keywords;
-    }
-
    public Collection<AnchorURL> getAnchors() {
        // returns all links embedded as anchors (clickeable entities)
        // this is a url(String)/text(String) map
@ -688,7 +687,7 @@ dc_rights
        for (final Document doc: docs) {
            this.sections.addAll(doc.sections);
            this.titles.addAll(doc.titles());
-            this.keywords.addAll(doc.getKeywords());
+            this.keywords.addAll(doc.dc_subject());
            for (String d: doc.dc_description()) this.descriptions.add(d);

            if (!(this.text instanceof ByteArrayOutputStream)) {
--- a/source/net/yacy/document/parser/psParser.java
+++ b/source/net/yacy/document/parser/psParser.java
@ -271,7 +271,6 @@ public class psParser extends AbstractParser implements Parser {
        try {
            // creating a tempfile
            tempFile = FileUtils.createTempFile(this.getClass(), "temp.ps");
-            tempFile.deleteOnExit();

            // copying inputstream into file
            FileUtils.copy(source,tempFile);
--- a/source/net/yacy/kelondro/data/word/WordReferenceRow.java
+++ b/source/net/yacy/kelondro/data/word/WordReferenceRow.java
@ -118,7 +118,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
            final byte[]   urlHash,
            final int      urlLength,     // byte-length of complete URL
            final int      urlComps,      // number of path components
-            final int      titleLength,   // length of description/length (longer are better?)
+            final int      titlewordcount,// number of words in the title
            final int      hitcount,      // how often appears this word in the text
            final int      wordcount,     // total number of words
            final int      phrasecount,   // total number of phrases
@ -141,7 +141,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
        this.entry.setCol(col_urlhash, urlHash);
        this.entry.setCol(col_lastModified, mddlm);
        this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
-        this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
+        this.entry.setCol(col_wordsInTitle, titlewordcount);
        this.entry.setCol(col_wordsInText, wordcount);
        this.entry.setCol(col_phrasesInText, phrasecount);
        this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
@ -163,7 +163,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
    public WordReferenceRow(final byte[]   urlHash,
                            final int      urlLength,     // byte-length of complete URL
                            final int      urlComps,      // number of path components
-                            final int      titleLength,   // length of description/length (longer are better?)
+                            final int      titlewordcount,// number of words in the title
                            final int      wordcount,     // total number of words
                            final int      phrasecount,   // total number of phrases
                            final long     lastmodified,  // last-modified time of the document where word appears
@ -180,7 +180,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
                        this.entry.setCol(col_urlhash, urlHash);
                        this.entry.setCol(col_lastModified, mddlm);
                        this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
-                        this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
+                        this.entry.setCol(col_wordsInTitle, titlewordcount);
                        this.entry.setCol(col_wordsInText, wordcount);
                        this.entry.setCol(col_phrasesInText, phrasecount);
                        this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
--- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java
+++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java
@ -34,11 +34,9 @@ import java.util.concurrent.LinkedBlockingQueue;
 import net.yacy.cora.date.MicroDate;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.util.ByteArray;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.index.Row;
 import net.yacy.kelondro.index.Row.Entry;
 import net.yacy.kelondro.rwi.AbstractReference;
@ -50,11 +48,11 @@ import net.yacy.kelondro.workflow.WorkflowProcessor;

 public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable, Comparable<WordReferenceVars>, Comparator<WordReferenceVars> {

-	/**
-	 * object for termination of concurrent blocking queue processing
-	 */
-	public static final WordReferenceVars poison = new WordReferenceVars();
-	protected static final byte[] default_language = UTF8.getBytes("en");
+    /**
+     * object for termination of concurrent blocking queue processing
+     */
+    public static final WordReferenceVars poison = new WordReferenceVars();
+    protected static final byte[] default_language = UTF8.getBytes("en");

    private final Bitfield flags;
    private long lastModified;
@ -71,31 +69,6 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
    private double termFrequency;
    private final boolean local;

-    public WordReferenceVars(final URIMetadataNode md, final boolean local) {
-        this.language = md.language();
-        this.flags = md.flags();
-        this.lastModified = md.moddate().getTime();
-        this.urlHash = md.hash();
-        this.type = md.doctype();
-        this.llocal = md.llocal();
-        this.lother = md.lother();
-        this.positions = new LinkedBlockingQueue<Integer>();
-        this.positions.add(1);
-        String urlNormalform = md.url().toNormalform(true);
-        this.urlcomps = MultiProtocolURL.urlComps(urlNormalform).length;
-        this.urllength = urlNormalform.length();
-        this.virtualAge = -1; // compute that later
-        // the following fields cannot be computed here very easy and are just filled with dummy values
-        this.phrasesintext = 1;
-        this.hitcount = 1;
-        this.posinphrase = 1;
-        this.posofphrase = 1;
-        this.wordsintext = 1;
-        this.wordsintitle = 1;
-        this.termFrequency = 1;
-        this.local = local;
-    }
-
    public WordReferenceVars(
            final byte[]   urlHash,
            final int      urlLength,     // byte-length of complete URL
--- a/source/net/yacy/kelondro/util/FileUtils.java
+++ b/source/net/yacy/kelondro/util/FileUtils.java
@ -790,6 +790,16 @@ public final class FileUtils {
        }
    }

+    /**
+     * Creates a temp file in the default system tmp directory (System property "java.io.tmpdir")
+     * with a name constructed by combination of class name and name.
+     * Marks the file with deleteOnExit() so that it is deleted at the latest when the JVM shuts down
+     *
+     * @param classObj name is used as prefix
+     * @param name
+     * @return temp file
+     * @throws IOException
+     */
    public static final File createTempFile(final Class<?> classObj, final String name) throws IOException {
        String parserClassName = classObj.getName();
        int idx = parserClassName.lastIndexOf('.');
@ -809,6 +819,7 @@ public final class FileUtils {
            File.createTempFile(
                parserClassName + "_" + ((idx > -1) ? fileName.substring(0, idx) : fileName),
                (!fileExt.isEmpty()) ? "." + fileExt : fileExt);
+        tempFile.deleteOnExit();
        return tempFile;
    }
    
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -3205,7 +3205,7 @@ public final class Switchboard extends serverSwitch {
        //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
        tags.add("crawlStart");
-        final String[] keywords = scraper.dc_subject();
+        final Set<String> keywords = scraper.dc_subject();
        if (keywords != null) {
            for (final String k: keywords) {
                final String kk = BookmarkHelper.cleanTagsString(k);
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@ -56,6 +56,7 @@ import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ByteBuffer;
+import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.LookAheadIterator;
 import net.yacy.cora.util.SpaceExceededException;
@ -686,10 +687,10 @@ public class Segment {

        // create a word prototype which is re-used for all entries
        if ((this.termIndex != null && storeToRWI) || searchEvent != null) {
-            final int len = (document == null) ? urlLength : document.dc_title().length();
+            final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val
            final WordReferenceRow ientry = new WordReferenceRow(
                            url.hash(),
-                            urlLength, urlComps, len,
+                            urlLength, urlComps, wordsintitle,
                            condenser.RESULT_NUMB_WORDS,
                            condenser.RESULT_NUMB_SENTENCES,
                            modDate.getTime(),