- migrates all entries in the old urldb

The metadata coordinate (lat / lon) NumberFormatException still occurs relatively often (see excerpt below), therefore:

- added a try/catch for URIMetadataRow (seems not to be needed in URIMetadataNode, as Solr internally checks the number format)
- removed a possible type conversion for the lat() / lon() comparison with 0.0f, changed it to 0.0 (leaving it to the compiler/optimizer to choose the number format)

Current log excerpt for the NumberFormatException:

W 2013/01/14 00:10:07 StackTrace For input string: "-"
java.lang.NumberFormatException: For input string: "-"
    at sun.misc.FloatingDecimal.readJavaFormatString(Unknown Source)
    at java.lang.Double.parseDouble(Unknown Source)
    at net.yacy.kelondro.data.meta.URIMetadataRow$Components.lon(URIMetadataRow.java:525)
    at net.yacy.kelondro.data.meta.URIMetadataRow.lon(URIMetadataRow.java:279)
    at net.yacy.search.index.SolrConfiguration.metadata2solr(SolrConfiguration.java:277)
    at net.yacy.search.index.Fulltext.putMetadata(Fulltext.java:329)
    at transferURL.respond(transferURL.java:152)
    ...
Caused by: java.lang.NumberFormatException: For input string: "-"
    at sun.misc.FloatingDecimal.readJavaFormatString(Unknown Source)
    at java.lang.Double.parseDouble(Unknown Source)
    at net.yacy.kelondro.data.meta.URIMetadataRow$Components.lon(URIMetadataRow.java:525)
    at net.yacy.kelondro.data.meta.URIMetadataRow.lon(URIMetadataRow.java:279)
    at net.yacy.search.index.SolrConfiguration.metadata2solr(SolrConfiguration.java:277)
    at net.yacy.search.index.Fulltext.putMetadata(Fulltext.java:329)
    at transferURL.respond(transferURL.java:152)
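The trace shows Double.parseDouble() failing on a bare "-" stored as a coordinate in an old metadata row. A minimal sketch of the kind of guard described above for the URIMetadataRow lat()/lon() accessors; the method name and the 0.0 fallback are illustrative assumptions, not the actual YaCy change:

    // Illustrative only: parse a stored coordinate defensively so that a malformed
    // value such as "-" yields 0.0 ("no location") instead of throwing a
    // NumberFormatException up through putMetadata()/transferURL.respond().
    private static double parseCoordinate(final String s) {
        if (s == null || s.isEmpty()) return 0.0d;
        try {
            return Double.parseDouble(s);
        } catch (final NumberFormatException e) {
            return 0.0d;
        }
    }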
parent 3b6e08b49f
commit 3897bb4409
@ -0,0 +1,36 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Migrate URLdb</title>
#%env/templates/metas.template%#
</head>
<body>
#%env/templates/header.template%#
#%env/templates/simpleheader.template%#

<h2>Migrate URLdb to embedded Solr Index</h2>

<p>Convert the old metadata (urldb) index to the embedded Solr fulltext index.</p>

<dl>
<dd>
<p>A low priority background job has been started which reads the old index, adds each entry to Solr and deletes it from the old index.</p>
<p>The default "slow migration" converts an entry of the old urldb index only when it is accessed (e.g. during search events).<br />
If you feel that the entries which are never accessed are still relevant, this migration converts all entries of the old urldb index.</p>
<p>You may refresh this page to see how many entries in the old index are left for migration.</p>
<p>Hint: this background task runs until all entries are migrated or YaCy is shut down. The migration is not automatically restarted.</p>
</dd>
</dl>
<form action="migrateurldb_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<input type="hidden" name="lastcount" value="#[lastcount]#" />
<input type="hidden" name="lasttime" value="#[lasttime]#" />

<p><b>#[count]# entries</b> in the old index are left to migrate.</p>
<p>For large indexes this may run for a long time (migration speed: #[speed]# entries per minute) <input type="submit" name="dorefresh" value="refresh" /></p>
</fieldset>
</form>

#%env/templates/footer.template%#
</body>
</html>
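The page above describes the actual job: read an entry from the old urldb index, add it to the embedded Solr index, then delete it from the old index. The real logic lives in migration.migrateUrldbtoSolr(Switchboard), which is not shown in this excerpt; the following is only a rough sketch with stand-in types:

    import java.util.Iterator;

    // Hypothetical outline of the background migration described above.
    final class UrldbMigrationSketch {

        interface LegacyUrldb {                 // stand-in for the old urldb index
            Iterator<String> urlHashes();
            String loadMetadata(String urlHash);
            void delete(String urlHash);
        }

        interface SolrFulltext {                // stand-in for the embedded Solr fulltext index
            void putMetadata(String urlHash, String metadata);
        }

        static int migrateAll(final LegacyUrldb urldb, final SolrFulltext solr) {
            int migrated = 0;
            final Iterator<String> hashes = urldb.urlHashes();
            while (hashes.hasNext()) {
                final String hash = hashes.next();
                solr.putMetadata(hash, urldb.loadMetadata(hash)); // add the entry to Solr ...
                urldb.delete(hash);                               // ... then remove it from the old index
                migrated++;
            }
            return migrated; // number of entries converted in this run
        }
    }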
@ -0,0 +1,44 @@
// migrateurldb_p.java

import net.yacy.cora.protocol.RequestHeader;
import net.yacy.migration;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;

public class migrateurldb_p {

    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        int cnt;

        if ((cnt = migration.migrateUrldbtoSolr(sb)) > 0) {
            prop.put("count", cnt);

            if (post != null && post.containsKey("dorefresh")) {
                int lastcount = post.getInt("lastcount", 0);
                long t = post.getLong("lasttime", 1);

                double difft = (System.currentTimeMillis() - t) / 60000.0d;
                int diff = (int) ((lastcount - cnt) / difft);
                prop.put("speed", diff);
                prop.put("lasttime", t);
                prop.put("lastcount", lastcount);

            } else {
                prop.put("speed", "?");
                prop.put("lastcount", cnt);
                prop.put("lasttime", System.currentTimeMillis());
            }
        } else {
            prop.put("speed", "");
            prop.put("count", "no urldb index available");
        }

        // return rewrite properties
        return prop;
    }
}
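The servlet above fills the #[count]#, #[speed]#, #[lastcount]# and #[lasttime]# placeholders of the migrateurldb_p.html template shown before it. The speed figure is simply the number of entries migrated since the last page load divided by the elapsed minutes; a self-contained example with made-up numbers:

    // Worked example of the "#[speed]# entries per minute" value computed by the servlet.
    // All numbers below are invented for illustration.
    public final class MigrationSpeedExample {
        public static void main(final String[] args) {
            final long lasttime = 0L;        // timestamp of the previous page load (ms)
            final long now = 180000L;        // three minutes later
            final int lastcount = 10000;     // entries left at the previous page load
            final int cnt = 9400;            // entries left now

            final double difft = (now - lasttime) / 60000.0d;    // elapsed minutes: 3.0
            final int speed = (int) ((lastcount - cnt) / difft); // (10000 - 9400) / 3.0
            System.out.println(speed + " entries per minute");   // prints "200 entries per minute"
        }
    }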
@ -1,495 +1,495 @@
Condenser.java (package net.yacy.document): the only change in this file is the lat() / lon() comparison, which now uses the double literal 0.0 instead of the float literal 0.0f; the rest of the file is unchanged.

-    if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
+    if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
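The switch from 0.0f to 0.0 does not change the outcome of the haslocation check, since zero widens exactly from float to double; it only avoids mixing a float literal into a double comparison. A tiny illustration (the local variable is made up):

    // The check behaves identically with 0.0f and 0.0: the float literal is
    // widened to double before the comparison, and zero widens exactly.
    public final class ZeroComparisonExample {
        public static void main(final String[] args) {
            final double lat = 0.0d;              // "no location" default
            System.out.println(lat != 0.0f);      // false: 0.0f is widened to 0.0d first
            System.out.println(lat != 0.0);       // false: plain double comparison
        }
    }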