diff --git a/htroot/IndexFederated_p.html b/htroot/IndexFederated_p.html
index f0805ae6c..778ba22db 100644
--- a/htroot/IndexFederated_p.html
+++ b/htroot/IndexFederated_p.html
@@ -26,7 +26,8 @@
This is a switchboard for the usage of embedded metadata to embedded solr.
The rwi index is necessary for index transmission and shall be switched off in future portalmode configurations.
Convert old meta data (urldb) index to embedded Solr fulltext index.
+
+
+
+
A low priority background job has been started which reads the old index, adds it to Solr and deletes the entry from the old index.
+
The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).
+ If you feel that the entries which were not accessed are still relevant, this migration will transfer all entries from the old urldb index.
+
You may refresh this page to see how many entries in the old index are left for migration
+
Hint: this background task runs until all entries are migrated or YaCy is shutdown. The migration is not automatically restarted.
+
+
+
+
+ #%env/templates/footer.template%#
+
+
diff --git a/htroot/migrateurldb_p.java b/htroot/migrateurldb_p.java
new file mode 100644
index 000000000..061a2edea
--- /dev/null
+++ b/htroot/migrateurldb_p.java
@@ -0,0 +1,53 @@
+// migrateurldb_p.java
+// Backend for the urldb -> embedded Solr migration status page.
+
+import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.migration;
+import net.yacy.search.Switchboard;
+import net.yacy.server.serverObjects;
+import net.yacy.server.serverSwitch;
+
+public class migrateurldb_p {
+
+    /**
+     * Triggers the urldb-to-Solr migration background job and reports its progress.
+     *
+     * @param header the request header (unused)
+     * @param post request parameters; "dorefresh" together with
+     *        "lastcount"/"lasttime" enables the migration-speed estimate
+     * @param env the server switchboard (cast to Switchboard, so it is used)
+     * @return template properties "count", "speed", "lastcount", "lasttime"
+     */
+    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
+        final serverObjects prop = new serverObjects();
+        final Switchboard sb = (Switchboard) env;
+
+        final int cnt = migration.migrateUrldbtoSolr(sb);
+        if (cnt > 0) {
+            prop.put("count", cnt);
+
+            if (post != null && post.containsKey("dorefresh")) {
+                final int lastcount = post.getInt("lastcount", 0);
+                final long t = post.getLong("lasttime", 1);
+
+                // average migration speed in entries/minute since the first
+                // page view; lastcount/lasttime are echoed back unchanged so
+                // subsequent refreshes keep measuring from the same baseline
+                final double difft = (System.currentTimeMillis() - t) / 60000.0d;
+                final int diff = difft > 0.0d ? (int) ((lastcount - cnt) / difft) : 0;
+                prop.put("speed", diff);
+                prop.put("lasttime", t);
+                prop.put("lastcount", lastcount);
+            } else {
+                prop.put("speed", "?");
+                prop.put("lastcount", cnt);
+                prop.put("lasttime", System.currentTimeMillis());
+            }
+        } else {
+            prop.put("speed", "");
+            prop.put("count", "no urldb index available");
+        }
+
+        // return rewrite properties
+        return prop;
+    }
+}
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index 9d17ae5e6..39e184024 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -1,495 +1,495 @@
-/**
- * Condenser.java
- * Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
- * First released 09.01.2004 at http://yacy.net
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program in the file lgpl21.txt
- * If not, see .
- */
-
-package net.yacy.document;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeMap;
-
-import org.apache.solr.common.params.MapSolrParams;
-import org.apache.solr.update.processor.Lookup3Signature;
-
-import net.yacy.cora.document.ASCII;
-import net.yacy.cora.document.WordCache;
-import net.yacy.cora.document.analysis.Classification.ContentDomain;
-import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
-import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.cora.federate.solr.Boost;
-import net.yacy.cora.language.synonyms.SynonymLibrary;
-import net.yacy.cora.lod.vocabulary.Tagging;
-import net.yacy.document.language.Identificator;
-import net.yacy.document.parser.html.ImageEntry;
-import net.yacy.kelondro.data.word.Word;
-import net.yacy.kelondro.data.word.WordReferenceRow;
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.util.Bitfield;
-import net.yacy.kelondro.util.SetTools;
-
-
-public final class Condenser {
-
- // this is the page analysis class
- public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
- public final static int wordminsize = 2;
- public final static int wordcut = 2;
-
- // category flags that show how the page can be distinguished in different interest groups
- public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
- public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
- public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
- public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
- public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
- public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
-
- //private Properties analysis;
- private final Map words; // a string (the words) to (indexWord) - relation
- private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging
- private final Set synonyms; // a set of synonyms to the words
- private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
- private String fuzzy_signature_text = null; // signatures for double-check detection
-
- public int RESULT_NUMB_WORDS = -1;
- public int RESULT_DIFF_WORDS = -1;
- public int RESULT_NUMB_SENTENCES = -1;
- public int RESULT_DIFF_SENTENCES = -1;
- public Bitfield RESULT_FLAGS = new Bitfield(4);
- private final Identificator languageIdentificator;
-
- public Condenser(
- final Document document,
- final boolean indexText,
- final boolean indexMedia,
- final WordCache meaningLib,
- final SynonymLibrary synlib,
- final boolean doAutotagging
- ) {
- Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
- // if addMedia == true, then all the media links are also parsed and added to the words
- // added media words are flagged with the appropriate media flag
- this.words = new HashMap();
- this.synonyms = new LinkedHashSet();
- this.RESULT_FLAGS = new Bitfield(4);
-
- // construct flag set for document
- if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
- if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
- if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
- if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
- if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
-
- this.languageIdentificator = new Identificator();
-
- // add the URL components to the word list
- insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
-
- Map.Entry entry;
- if (indexText) {
- createCondensement(document.getTextString(), meaningLib, doAutotagging);
- // the phrase counter:
- // phrase 0 are words taken from the URL
- // phrase 1 is the MainTitle
- // phrase 2 is
- // phrase 3 is the Document Abstract
- // phrase 4 is the Document Author
- // phrase 5 is the Document Publisher
- // phrase 6 are the tags specified in document
- // phrase 10 and above are the section headlines/titles (88 possible)
- // phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
- // phrase 99 is taken from the media Link url and anchor description
- // phrase 100 and above are lines from the text
- insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
- insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
- insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
- insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
- insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
- // missing: tags!
- final String[] titles = document.getSectionTitles();
- for (int i = 0; i < titles.length; i++) {
- insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
- }
-
- // anchors: for text indexing we add only the anchor description
- // REMOVED! Reason:
- // words from the anchor description should appear as normal text in the output from the parser
- // to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of
- // pages that are shown in the search result. The words from the URLS should also not appear as part of the index, because they
- // are not visible in the text and could be used to crate fake-content
- /*
- final Iterator> i = document.getAnchors().entrySet().iterator();
- while (i.hasNext()) {
- entry = i.next();
- if ((entry == null) || (entry.getKey() == null)) continue;
- insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
- }
- */
- } else {
- this.RESULT_NUMB_WORDS = 0;
- this.RESULT_DIFF_WORDS = 0;
- this.RESULT_NUMB_SENTENCES = 0;
- this.RESULT_DIFF_SENTENCES = 0;
- }
-
- if (indexMedia) {
- // add anchor descriptions: here, we also add the url components
- // audio
- Iterator> i = document.getAudiolinks().entrySet().iterator();
- while (i.hasNext()) {
- entry = i.next();
- insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
- insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
- }
-
- // video
- i = document.getVideolinks().entrySet().iterator();
- while (i.hasNext()) {
- entry = i.next();
- insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
- insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
- }
-
- // applications
- i = document.getApplinks().entrySet().iterator();
- while (i.hasNext()) {
- entry = i.next();
- insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
- insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
- }
-
- // images
- final Iterator j = document.getImages().values().iterator();
- ImageEntry ientry;
- MultiProtocolURI url;
- while (j.hasNext()) {
- ientry = j.next();
- url = ientry.url();
- if (url == null) continue;
- insertTextToWords(new SentenceReader(url.toNormalform(true)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
- insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
- }
-
- // finally check all words for missing flag entry
- final Iterator> k = this.words.entrySet().iterator();
- Word wprop;
- Map.Entry we;
- while (k.hasNext()) {
- we = k.next();
- wprop = we.getValue();
- if (wprop.flags == null) {
- wprop.flags = this.RESULT_FLAGS.clone();
- this.words.put(we.getKey(), wprop);
- }
- }
- }
-
- // extend the tags in the document object with autotagging tags
- if (!this.tags.isEmpty()) {
- document.addMetatags(this.tags);
- }
-
- if (synlib != null) {
- for (String word: this.words.keySet()) {
- Set syms = synlib.getSynonyms(word);
- if (syms != null) this.synonyms.addAll(syms);
- }
- }
- String text = document.getTextString();
-
- // create the synonyms set
- if (synonyms != null) {
- for (String word: this.words.keySet()) {
- Set syms = synlib.getSynonyms(word);
- if (syms != null) this.synonyms.addAll(syms);
- }
- }
-
- // create hashes for duplicate detection
- // check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
- EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
- Map sp = new HashMap();
- sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
- sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
- fuzzySignatureFactory.init(new MapSolrParams(sp));
- fuzzySignatureFactory.add(text);
- byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
- long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff);
- this.fuzzy_signature = l;
- this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString();
- Lookup3Signature exactSignatureFactory = new Lookup3Signature();
- exactSignatureFactory.add(text);
- byte[] exact_signature_hash = exactSignatureFactory.getSignature();
- l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff);
- this.exact_signature = l;
- }
-
- private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
- this.languageIdentificator = null; // we don't need that here
- // analysis = new Properties();
- this.words = new TreeMap();
- this.synonyms = new HashSet();
- createCondensement(text, meaningLib, doAutotagging);
- }
-
- private void insertTextToWords(
- final SentenceReader text,
- final int phrase,
- final int flagpos,
- final Bitfield flagstemplate,
- final boolean useForLanguageIdentification,
- final WordCache meaningLib) {
- if (text == null) return;
- String word;
- Word wprop;
- WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
- try {
- int pip = 0;
- while (wordenum.hasMoreElements()) {
- word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
- if (useForLanguageIdentification) this.languageIdentificator.add(word);
- if (word.length() < 2) continue;
- wprop = this.words.get(word);
- if (wprop == null) wprop = new Word(0, pip, phrase);
- if (wprop.flags == null) wprop.flags = flagstemplate.clone();
- wprop.flags.set(flagpos, true);
- this.words.put(word, wprop);
- pip++;
- this.RESULT_NUMB_WORDS++;
- this.RESULT_DIFF_WORDS++;
- }
- } finally {
- wordenum.close();
- }
- }
-
- public int excludeWords(final SortedSet stopwords) {
- // subtracts the given stopwords from the word list
- // the word list shrinkes. This returns the number of shrinked words
- final int oldsize = this.words.size();
- SetTools.excludeDestructive(this.words, stopwords);
- return oldsize - this.words.size();
- }
-
- public Map words() {
- // returns the words as word/indexWord relation map
- return this.words;
- }
-
- public List synonyms() {
- ArrayList l = new ArrayList(this.synonyms.size());
- for (String s: this.synonyms) l.add(s);
- return l;
- }
-
- public long fuzzySignature() {
- return this.fuzzy_signature;
- }
-
- public String fuzzySignatureText() {
- return this.fuzzy_signature_text;
- }
-
- public long exactSignature() {
- return this.exact_signature;
- }
-
- public String language() {
- return this.languageIdentificator.getLanguage();
- }
-
- private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
- assert text != null;
- final Set currsentwords = new HashSet();
- String word = "";
- String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
- for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
- String k;
- Tagging.Metatag tag;
- int wordlen;
- Word wsp;
- final Word wsp1;
- int wordHandle;
- int wordHandleCount = 0;
- final int sentenceHandleCount = 0;
- int allwordcounter = 0;
- final int allsentencecounter = 0;
- int wordInSentenceCounter = 1;
- boolean comb_indexof = false, last_last = false, last_index = false;
- final Map sentences = new HashMap(100);
- if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
-
- // read source
- final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
- try {
- while (wordenum.hasMoreElements()) {
- word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
- if (this.languageIdentificator != null) this.languageIdentificator.add(word);
- if (word.length() < wordminsize) continue;
-
- // get tags from autotagging
- if (doAutotagging) {
- for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
- // wordc is number of words that are tested
- StringBuilder sb = new StringBuilder();
- if (wordc == 1) {
- sb.append(word);
- } else {
- for (int w = 0; w < wordc - 1; w++) {
- sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
- }
- sb.append(word);
- }
- String testterm = sb.toString().trim();
- //System.out.println("Testing: " + testterm);
- tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
- if (tag != null) {
- String navigatorName = tag.getVocabularyName();
- Set tagset = this.tags.get(navigatorName);
- if (tagset == null) {
- tagset = new HashSet();
- this.tags.put(navigatorName, tagset);
- }
- tagset.add(tag);
- }
- }
- }
- // shift wordcache
- System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
- wordcache[wordcache.length - 1] = word;
-
- // distinguish punctuation and words
- wordlen = word.length();
- if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
- // store sentence
- currsentwords.clear();
- wordInSentenceCounter = 1;
- } else {
- // check index.of detection
- if (last_last && comb_indexof && word.equals("modified")) {
- this.RESULT_FLAGS.set(flag_cat_indexof, true);
- wordenum.pre(true); // parse lines as they come with CRLF
- }
- if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
- last_last = word.equals("last");
- last_index = word.equals("index");
-
- // store word
- allwordcounter++;
- currsentwords.add(word);
- wsp = this.words.get(word);
- if (wsp != null) {
- // word already exists
- wordHandle = wsp.posInText;
- wsp.inc();
- } else {
- // word does not yet exist, create new word entry
- wordHandle = wordHandleCount++;
- wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
- wsp.flags = this.RESULT_FLAGS.clone();
- this.words.put(word, wsp);
- }
- // we now have the unique handle of the word, put it into the sentence:
- wordInSentenceCounter++;
- }
- }
- } finally {
- wordenum.close();
- }
-
- if (pseudostemming) {
- Map.Entry entry;
- // we search for similar words and reorganize the corresponding sentences
- // a word is similar, if a shortened version is equal
- final Iterator> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order
- wordsearch: while (wi.hasNext()) {
- entry = wi.next();
- word = entry.getKey();
- wordlen = word.length();
- wsp = entry.getValue();
- for (int i = wordcut; i > 0; i--) {
- if (wordlen > i) {
- k = word.substring(0, wordlen - i);
- if (this.words.containsKey(k)) {
- // update word counter
- wsp1.count = wsp1.count + wsp.count;
- this.words.put(k, wsp1);
- // remove current word
- wi.remove();
- continue wordsearch;
- }
- }
- }
- }
- }
-
- // store result
- //this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
- this.RESULT_NUMB_WORDS = allwordcounter;
- this.RESULT_DIFF_WORDS = wordHandleCount;
- this.RESULT_NUMB_SENTENCES = allsentencecounter;
- this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
- }
-
- public static Map getWords(final String text, final WordCache meaningLib) {
- // returns a word/indexWord relation map
- if (text == null) return null;
- return new Condenser(text, meaningLib, false).words();
- }
-
- public static void main(final String[] args) {
- // read a property file and convert them into configuration lines
- try {
- final File f = new File(args[0]);
- final Properties p = new Properties();
- p.load(new FileInputStream(f));
- final StringBuilder sb = new StringBuilder();
- sb.append("{\n");
- for (int i = 0; i <= 15; i++) {
- sb.append('"');
- final String s = p.getProperty("keywords" + i);
- final String[] l = s.split(",");
- for (final String element : l) {
- sb.append(ASCII.String(Word.word2hash(element)));
- }
- if (i < 15) sb.append(",\n");
- }
- sb.append("}\n");
- System.out.println(sb.toString());
- } catch (final FileNotFoundException e) {
- Log.logException(e);
- } catch (final IOException e) {
- Log.logException(e);
- }
-
- }
-
-}
+/**
+ * Condenser.java
+ * Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
+ * First released 09.01.2004 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see .
+ */
+
+package net.yacy.document;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeMap;
+
+import org.apache.solr.common.params.MapSolrParams;
+import org.apache.solr.update.processor.Lookup3Signature;
+
+import net.yacy.cora.document.ASCII;
+import net.yacy.cora.document.WordCache;
+import net.yacy.cora.document.analysis.Classification.ContentDomain;
+import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
+import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.federate.solr.Boost;
+import net.yacy.cora.language.synonyms.SynonymLibrary;
+import net.yacy.cora.lod.vocabulary.Tagging;
+import net.yacy.document.language.Identificator;
+import net.yacy.document.parser.html.ImageEntry;
+import net.yacy.kelondro.data.word.Word;
+import net.yacy.kelondro.data.word.WordReferenceRow;
+import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.util.Bitfield;
+import net.yacy.kelondro.util.SetTools;
+
+
+public final class Condenser {
+
+ // this is the page analysis class
+ public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
+ public final static int wordminsize = 2;
+ public final static int wordcut = 2;
+
+ // category flags that show how the page can be distinguished in different interest groups
+ public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
+ public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
+ public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
+ public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
+ public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
+ public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
+
+ //private Properties analysis;
+ private final Map words; // a string (the words) to (indexWord) - relation
+ private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging
+ private final Set synonyms; // a set of synonyms to the words
+ private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
+ private String fuzzy_signature_text = null; // signatures for double-check detection
+
+ public int RESULT_NUMB_WORDS = -1;
+ public int RESULT_DIFF_WORDS = -1;
+ public int RESULT_NUMB_SENTENCES = -1;
+ public int RESULT_DIFF_SENTENCES = -1;
+ public Bitfield RESULT_FLAGS = new Bitfield(4);
+ private final Identificator languageIdentificator;
+
+ public Condenser(
+ final Document document,
+ final boolean indexText,
+ final boolean indexMedia,
+ final WordCache meaningLib,
+ final SynonymLibrary synlib,
+ final boolean doAutotagging
+ ) {
+ Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
+ // if addMedia == true, then all the media links are also parsed and added to the words
+ // added media words are flagged with the appropriate media flag
+ this.words = new HashMap();
+ this.synonyms = new LinkedHashSet();
+ this.RESULT_FLAGS = new Bitfield(4);
+
+ // construct flag set for document
+ if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
+ if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
+ if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
+ if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
+ if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
+
+ this.languageIdentificator = new Identificator();
+
+ // add the URL components to the word list
+ insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
+
+ Map.Entry entry;
+ if (indexText) {
+ createCondensement(document.getTextString(), meaningLib, doAutotagging);
+ // the phrase counter:
+ // phrase 0 are words taken from the URL
+ // phrase 1 is the MainTitle
+ // phrase 2 is
+ // phrase 3 is the Document Abstract
+ // phrase 4 is the Document Author
+ // phrase 5 is the Document Publisher
+ // phrase 6 are the tags specified in document
+ // phrase 10 and above are the section headlines/titles (88 possible)
+ // phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
+ // phrase 99 is taken from the media Link url and anchor description
+ // phrase 100 and above are lines from the text
+ insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
+ insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
+ insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
+ insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
+ insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
+ // missing: tags!
+ final String[] titles = document.getSectionTitles();
+ for (int i = 0; i < titles.length; i++) {
+ insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
+ }
+
+ // anchors: for text indexing we add only the anchor description
+ // REMOVED! Reason:
+ // words from the anchor description should appear as normal text in the output from the parser
+ // to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of
+ // pages that are shown in the search result. The words from the URLS should also not appear as part of the index, because they
+ // are not visible in the text and could be used to crate fake-content
+ /*
+ final Iterator> i = document.getAnchors().entrySet().iterator();
+ while (i.hasNext()) {
+ entry = i.next();
+ if ((entry == null) || (entry.getKey() == null)) continue;
+ insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
+ }
+ */
+ } else {
+ this.RESULT_NUMB_WORDS = 0;
+ this.RESULT_DIFF_WORDS = 0;
+ this.RESULT_NUMB_SENTENCES = 0;
+ this.RESULT_DIFF_SENTENCES = 0;
+ }
+
+ if (indexMedia) {
+ // add anchor descriptions: here, we also add the url components
+ // audio
+ Iterator> i = document.getAudiolinks().entrySet().iterator();
+ while (i.hasNext()) {
+ entry = i.next();
+ insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
+ insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
+ }
+
+ // video
+ i = document.getVideolinks().entrySet().iterator();
+ while (i.hasNext()) {
+ entry = i.next();
+ insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
+ insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
+ }
+
+ // applications
+ i = document.getApplinks().entrySet().iterator();
+ while (i.hasNext()) {
+ entry = i.next();
+ insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
+ insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
+ }
+
+ // images
+ final Iterator j = document.getImages().values().iterator();
+ ImageEntry ientry;
+ MultiProtocolURI url;
+ while (j.hasNext()) {
+ ientry = j.next();
+ url = ientry.url();
+ if (url == null) continue;
+ insertTextToWords(new SentenceReader(url.toNormalform(true)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
+ insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
+ }
+
+ // finally check all words for missing flag entry
+ final Iterator> k = this.words.entrySet().iterator();
+ Word wprop;
+ Map.Entry we;
+ while (k.hasNext()) {
+ we = k.next();
+ wprop = we.getValue();
+ if (wprop.flags == null) {
+ wprop.flags = this.RESULT_FLAGS.clone();
+ this.words.put(we.getKey(), wprop);
+ }
+ }
+ }
+
+ // extend the tags in the document object with autotagging tags
+ if (!this.tags.isEmpty()) {
+ document.addMetatags(this.tags);
+ }
+
+ if (synlib != null) {
+ for (String word: this.words.keySet()) {
+ Set<String> syms = synlib.getSynonyms(word);
+ if (syms != null) this.synonyms.addAll(syms);
+ }
+ }
+ String text = document.getTextString();
+
+ // create the synonyms set (guard on synlib: the former check of the always-initialized
+ // synonyms field let this block NPE on synlib.getSynonyms when no synonym library is set)
+ if (synlib != null) {
+ for (String word: this.words.keySet()) {
+ Set<String> syms = synlib.getSynonyms(word);
+ if (syms != null) this.synonyms.addAll(syms);
+ }
+ }
+
+ // create hashes for duplicate detection
+ // check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
+ EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
+ Map<String, String> sp = new HashMap<String, String>();
+ sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
+ sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
+ fuzzySignatureFactory.init(new MapSolrParams(sp));
+ fuzzySignatureFactory.add(text);
+ byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
+ long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff);
+ this.fuzzy_signature = l;
+ this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString();
+ Lookup3Signature exactSignatureFactory = new Lookup3Signature();
+ exactSignatureFactory.add(text);
+ byte[] exact_signature_hash = exactSignatureFactory.getSignature();
+ l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff);
+ this.exact_signature = l;
+ }
+
+ private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
+ this.languageIdentificator = null; // we don't need that here
+ // analysis = new Properties();
+ this.words = new TreeMap();
+ this.synonyms = new HashSet();
+ createCondensement(text, meaningLib, doAutotagging);
+ }
+
+ private void insertTextToWords(
+ final SentenceReader text,
+ final int phrase,
+ final int flagpos,
+ final Bitfield flagstemplate,
+ final boolean useForLanguageIdentification,
+ final WordCache meaningLib) {
+ if (text == null) return;
+ String word;
+ Word wprop;
+ WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
+ try {
+ int pip = 0;
+ while (wordenum.hasMoreElements()) {
+ word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
+ if (useForLanguageIdentification) this.languageIdentificator.add(word);
+ if (word.length() < 2) continue;
+ wprop = this.words.get(word);
+ if (wprop == null) wprop = new Word(0, pip, phrase);
+ if (wprop.flags == null) wprop.flags = flagstemplate.clone();
+ wprop.flags.set(flagpos, true);
+ this.words.put(word, wprop);
+ pip++;
+ this.RESULT_NUMB_WORDS++;
+ this.RESULT_DIFF_WORDS++;
+ }
+ } finally {
+ wordenum.close();
+ }
+ }
+
+ public int excludeWords(final SortedSet stopwords) {
+ // subtracts the given stopwords from the word list
+ // the word list shrinkes. This returns the number of shrinked words
+ final int oldsize = this.words.size();
+ SetTools.excludeDestructive(this.words, stopwords);
+ return oldsize - this.words.size();
+ }
+
+ public Map<String, Word> words() {
+ // returns the words as word/indexWord relation map
+ return this.words;
+ }
+
+ public List<String> synonyms() {
+ ArrayList<String> l = new ArrayList<String>(this.synonyms.size());
+ for (String s: this.synonyms) l.add(s);
+ return l;
+ }
+
+ public long fuzzySignature() {
+ return this.fuzzy_signature;
+ }
+
+ public String fuzzySignatureText() {
+ return this.fuzzy_signature_text;
+ }
+
+ public long exactSignature() {
+ return this.exact_signature;
+ }
+
+ public String language() {
+ return this.languageIdentificator.getLanguage();
+ }
+
+ private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
+ assert text != null;
+ final Set<String> currsentwords = new HashSet<String>();
+ String word = "";
+ String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
+ for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
+ String k;
+ Tagging.Metatag tag;
+ int wordlen;
+ Word wsp;
+ Word wsp1; // not final: (re)assigned inside the pseudostemming loop below
+ int wordHandle;
+ int wordHandleCount = 0;
+ final int sentenceHandleCount = 0;
+ int allwordcounter = 0;
+ final int allsentencecounter = 0;
+ int wordInSentenceCounter = 1;
+ boolean comb_indexof = false, last_last = false, last_index = false;
+ final Map sentences = new HashMap(100);
+ if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
+
+ // read source
+ final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
+ try {
+ while (wordenum.hasMoreElements()) {
+ word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
+ if (this.languageIdentificator != null) this.languageIdentificator.add(word);
+ if (word.length() < wordminsize) continue;
+
+ // get tags from autotagging
+ if (doAutotagging) {
+ for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
+ // wordc is number of words that are tested
+ StringBuilder sb = new StringBuilder();
+ if (wordc == 1) {
+ sb.append(word);
+ } else {
+ for (int w = 0; w < wordc - 1; w++) {
+ sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
+ }
+ sb.append(word);
+ }
+ String testterm = sb.toString().trim();
+ //System.out.println("Testing: " + testterm);
+ tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
+ if (tag != null) {
+ String navigatorName = tag.getVocabularyName();
+ Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
+ if (tagset == null) {
+ tagset = new HashSet<Tagging.Metatag>();
+ this.tags.put(navigatorName, tagset);
+ }
+ tagset.add(tag);
+ }
+ }
+ }
+ // shift wordcache
+ System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
+ wordcache[wordcache.length - 1] = word;
+
+ // distinguish punctuation and words
+ wordlen = word.length();
+ if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
+ // store sentence
+ currsentwords.clear();
+ wordInSentenceCounter = 1;
+ } else {
+ // check index.of detection
+ if (last_last && comb_indexof && word.equals("modified")) {
+ this.RESULT_FLAGS.set(flag_cat_indexof, true);
+ wordenum.pre(true); // parse lines as they come with CRLF
+ }
+ if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
+ last_last = word.equals("last");
+ last_index = word.equals("index");
+
+ // store word
+ allwordcounter++;
+ currsentwords.add(word);
+ wsp = this.words.get(word);
+ if (wsp != null) {
+ // word already exists
+ wordHandle = wsp.posInText;
+ wsp.inc();
+ } else {
+ // word does not yet exist, create new word entry
+ wordHandle = wordHandleCount++;
+ wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
+ wsp.flags = this.RESULT_FLAGS.clone();
+ this.words.put(word, wsp);
+ }
+ // we now have the unique handle of the word, put it into the sentence:
+ wordInSentenceCounter++;
+ }
+ }
+ } finally {
+ wordenum.close();
+ }
+
+ if (pseudostemming) {
+ Map.Entry<String, Word> entry;
+ // we search for similar words and reorganize the corresponding sentences
+ // a word is similar, if a shortened version is equal
+ final Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order
+ wordsearch: while (wi.hasNext()) {
+ entry = wi.next();
+ word = entry.getKey();
+ wordlen = word.length();
+ wsp = entry.getValue();
+ for (int i = wordcut; i > 0; i--) {
+ if (wordlen > i) {
+ k = word.substring(0, wordlen - i);
+ if (this.words.containsKey(k)) {
+ // update word counter of the retained stem; the lookup assigning
+ // wsp1 had been lost, leaving wsp1 unassigned before use
+ wsp1 = this.words.get(k);
+ wsp1.count = wsp1.count + wsp.count;
+ this.words.put(k, wsp1);
+ // remove current word
+ wi.remove();
+ continue wordsearch;
+ }
+ }
+ }
+ }
+ }
+
+ // store result
+ //this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
+ this.RESULT_NUMB_WORDS = allwordcounter;
+ this.RESULT_DIFF_WORDS = wordHandleCount;
+ this.RESULT_NUMB_SENTENCES = allsentencecounter;
+ this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
+ }
+
+ public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
+ // returns a word/indexWord relation map
+ if (text == null) return null;
+ return new Condenser(text, meaningLib, false).words();
+ }
+
+ public static void main(final String[] args) {
+ // read a property file and convert them into configuration lines
+ try {
+ final File f = new File(args[0]);
+ final Properties p = new Properties();
+ p.load(new FileInputStream(f));
+ final StringBuilder sb = new StringBuilder();
+ sb.append("{\n");
+ for (int i = 0; i <= 15; i++) {
+ sb.append('"');
+ final String s = p.getProperty("keywords" + i);
+ final String[] l = s.split(",");
+ for (final String element : l) {
+ sb.append(ASCII.String(Word.word2hash(element)));
+ }
+ if (i < 15) sb.append(",\n");
+ }
+ sb.append("}\n");
+ System.out.println(sb.toString());
+ } catch (final FileNotFoundException e) {
+ Log.logException(e);
+ } catch (final IOException e) {
+ Log.logException(e);
+ }
+
+ }
+
+}
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index e63ad37e6..ca7a7df43 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -722,7 +722,7 @@ dc_rights
final String language = dc_language();
if (language != null && language.length() > 0) os.write("" + dc_language() + "\n");
os.write("" + ISO8601Formatter.FORMATTER.format(date) + "\n");
- if (this.lon != 0.0f && this.lat != 0.0f) os.write("" + this.lon +"" + this.lat + "\n");
+ if (this.lon != 0.0 && this.lat != 0.0) os.write("" + this.lon +"" + this.lat + "\n");
os.write("\n");
}
@@ -821,7 +821,7 @@ dc_rights
anchors.putAll(doc.getAnchors());
rss.putAll(doc.getRSS());
ContentScraper.addAllImages(images, doc.getImages());
- if (doc.lon() != 0.0f && doc.lat() != 0.0f) { lon = doc.lon(); lat = doc.lat(); }
+ if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
}
// clean up parser data
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
index 185868414..087760853 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
@@ -226,7 +226,7 @@ public class URIMetadataRow {
s.appendLF();
if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher);
s.appendLF();
- if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF();
+ if (lon == 0.0 && lat == 0.0) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF();
String s0 = s.toString();
s.close();
return UTF8.getBytes(s0);
@@ -514,7 +514,11 @@ public class URIMetadataRow {
if (p < 0) {
return 0.0d;
}
- return this.latlon.charAt(0) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(0, p));
+ try {
+ return this.latlon.charAt(0) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(0, p));
+ } catch (NumberFormatException e) {
+ return 0.0d;
+ }
}
public double lon() {
if (this.latlon == null || this.latlon.isEmpty()) return 0.0d;
@@ -522,7 +526,11 @@ public class URIMetadataRow {
if (p < 0) {
return 0.0d;
}
- return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1));
+ try {
+ return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1));
+ } catch (NumberFormatException e) {
+ return 0.0d;
+ }
}
}
diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java
index 6709c5eea..f7c446356 100644
--- a/source/net/yacy/migration.java
+++ b/source/net/yacy/migration.java
@@ -33,6 +33,11 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import com.google.common.io.Files;
+import java.util.Iterator;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.index.Index;
+import net.yacy.kelondro.index.Row;
+import net.yacy.search.index.Fulltext;
public class migration {
//SVN constants
@@ -256,4 +261,82 @@ public class migration {
sb.setConfig("crawler.http.acceptCharset", sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
}
}
+ /**
+ * converts old urldb to Solr.
+ * In chunks of 1000 entries.
+ * Creates a lock file in workdir to allow only one active migration thread
+ * @return current size of urldb index
+ */
+ @SuppressWarnings("deprecation")
+ public static int migrateUrldbtoSolr(final Switchboard sb) {
+ int ret = 0;
+ final File f;
+ final Fulltext ft = sb.index.fulltext();
+
+ if (ft.getURLDb() != null) {
+ ret = ft.getURLDb().size();
+ f = new File(sb.workPath, "migrateUrldbtoSolr.lck");
+ f.deleteOnExit();
+ if (f.exists()) {
+ return ret;
+ } else {
+ try {
+ f.createNewFile();
+ } catch (IOException ex) {
+ Log.logInfo("migrateUrldbtoSolr","could not create lock file");
+ }
+ }
+
+ final Thread t = new Thread() {
+ boolean go = true;
+ final Index urldb = ft.getURLDb();
+
+ public void run() {
+ try {
+ Thread.currentThread().setName("migration.migrateUrldbtoSolr");
+
+ int i = urldb.size();
+ while (go && i > 0) {
+
+ List<Row.Entry> chunk = urldb.random(1000);
+ if ((chunk == null) || (chunk.size() == 0)) {
+ go = false;
+ break;
+ }
+ Iterator<Row.Entry> chunkit = chunk.iterator();
+
+ while (go && chunkit.hasNext()) {
+ try { // to catch any data errors
+ URIMetadataRow row = new URIMetadataRow(chunkit.next(), null);
+ ft.putMetadata(row); // this deletes old urldb-entry first and inserts into Solr
+ i--;
+ if (Switchboard.getSwitchboard().shallTerminate()) {
+ go = false;
+ }
+ } catch (Exception e) {
+ Log.logInfo("migrateUrldbtoSolr", "some error while adding old data to new index, continue with next entry");
+ }
+ }
+ Log.logInfo("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)");
+ }
+ ft.commit();
+
+ } catch (IOException ex) {
+ Log.logInfo("migrateUrldbtoSolr", "error reading old urldb index");
+ } finally {
+ if (f.exists()) {
+ f.delete(); // delete lock file
+ }
+ }
+ }
+
+ public void exit() {
+ go = false;
+ }
+ };
+ t.setPriority(Thread.MIN_PRIORITY);
+ t.start();
+ }
+ return ret;
+ }
}
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java
index 0228c1b2f..036269e1a 100644
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -100,6 +100,28 @@ public final class Fulltext implements Iterable {
this.forcedCommitTime = 0;
}
+ /**
+ * Gives direct access to the old metadata index (URLDb).
+ * Used only for the migration of old urldb entries to Solr.
+ * @return the connected URLDb, or null if none is connected
+ * @deprecated current and future versions use Solr for metadata
+ */
+ @Deprecated
+ public Index getURLDb() {
+ return this.urlIndexFile;
+ }
+
+ /**
+ * Tells whether the old metadata index (URLDb) is connected.
+ * Used only for the migration of old urldb entries to Solr.
+ * @return true if the old URLDb index is connected
+ * @deprecated current and future versions use Solr for metadata
+ */
+ @Deprecated
+ public boolean connectedURLDb() {
+ return this.urlIndexFile != null;
+ }
+
protected void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
if (this.urlIndexFile != null) return;
this.tablename = tablename;
diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java
index e6de803e9..90be0c28b 100644
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@@ -274,7 +274,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, "UTF8");
// coordinates
- if (md.lat() != 0.0f && md.lon() != 0.0f) {
+ if (md.lat() != 0.0 && md.lon() != 0.0) {
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(md.lat()) + "," + Double.toString(md.lon()));
}
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, 200);
@@ -794,7 +794,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, document.getCharset());
// coordinates
- if (document.lat() != 0.0f && document.lon() != 0.0f) {
+ if (document.lat() != 0.0 && document.lon() != 0.0) {
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon()));
}
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode());
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index ed7714648..a4809690c 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -836,7 +836,7 @@ public final class SearchEvent {
}
// check location constraint
- if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && (page.lat() == 0.0f || page.lon() == 0.0f)) {
+ if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && (page.lat() == 0.0 || page.lon() == 0.0)) {
this.query.misses.add(page.hash());
continue;
}