diff --git a/htroot/IndexFederated_p.html b/htroot/IndexFederated_p.html index f0805ae6c..778ba22db 100644 --- a/htroot/IndexFederated_p.html +++ b/htroot/IndexFederated_p.html @@ -26,7 +26,8 @@ This is a switchboard for the usage of embedded metadata to embedded solr. The rwi index is necessary for index transmission and shall be switched off in future portalmode configurations.
-
url metadata and embedded solr fulltext search index, interface at: /solr/select?q=*:*&start=0&rows=10
+
url metadata and embedded solr fulltext search index, interface at: /solr/select?q=*:*&start=0&rows=10 + #(migrateUrlDbtoSolr)#:: #(/migrateUrlDbtoSolr)#
embedded 'classic' rwi index
embedded citation reference index (link structure, used for ranking)
diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 7c0eb6ef9..6da71b3cd 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -225,6 +225,10 @@ public class IndexFederated_p { prop.put("solr.indexing.sharding", env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SHARDING, "modulo-host-md5")); prop.put("solr.indexing.schemefile", schemename); + if ((sb.index.fulltext().connectedURLDb())) { + prop.put("migrateUrlDbtoSolr", 1); + } else prop.put("migrateUrlDbtoSolr", 0); + // return rewrite properties return prop; } diff --git a/htroot/migrateurldb_p.html b/htroot/migrateurldb_p.html new file mode 100644 index 000000000..05487b6d1 --- /dev/null +++ b/htroot/migrateurldb_p.html @@ -0,0 +1,36 @@ + + + + Migrate URLdb + #%env/templates/metas.template%# + + + #%env/templates/header.template%# + #%env/templates/simpleheader.template%# + +

Migrate URLdb to embedded Solr Index

+ +

Convert old meta data (urldb) index to embedded Solr fulltext index.

+ +
+
+

A low-priority background job has been started which reads the old index, adds its entries to Solr, and deletes them from the old index.

+

The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).
+ If you feel that the entries that were never accessed are still relevant, this migration will migrate all entries from the old urldb index.

+

You may refresh this page to see how many entries in the old index are left for migration

+

Hint: this background task runs until all entries are migrated or YaCy is shut down. The migration is not automatically restarted.

+
+
+
+
+ + + +

#[count]# entries in old index left to migrate.

+

For large indexes this may run for a long time (migration speed: #[speed]# entries per minute)

+
+
+ + #%env/templates/footer.template%# + + diff --git a/htroot/migrateurldb_p.java b/htroot/migrateurldb_p.java new file mode 100644 index 000000000..061a2edea --- /dev/null +++ b/htroot/migrateurldb_p.java @@ -0,0 +1,44 @@ +// migrateurldb_p.java + +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.migration; +import net.yacy.search.Switchboard; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; + +public class migrateurldb_p { + + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) { + final serverObjects prop = new serverObjects(); + final Switchboard sb = (Switchboard) env; + + int cnt; + + if ((cnt = migration.migrateUrldbtoSolr(sb)) > 0) { + prop.put("count", cnt); + + if (post != null && post.containsKey("dorefresh")) { + int lastcount = post.getInt("lastcount", 0); + Long t = post.getLong("lasttime", 1); + + Double difft = (System.currentTimeMillis() - t) / 60000.0d; + int diff = (int)((lastcount - cnt) / difft) ; + prop.put("speed", diff); + prop.put("lasttime", t); + prop.put("lastcount", lastcount); + + } else { + prop.put("speed", "?"); + prop.put("lastcount",cnt); + prop.put("lasttime", System.currentTimeMillis()); + } + } else { + prop.put("speed", ""); + prop.put("count", "no urldb index available"); + } + + + // return rewrite properties + return prop; + } +} \ No newline at end of file diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 9d17ae5e6..39e184024 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -1,495 +1,495 @@ -/** - * Condenser.java - * Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany - * First released 09.01.2004 at http://yacy.net - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser 
General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.document; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Properties; -import java.util.Set; -import java.util.SortedSet; -import java.util.TreeMap; - -import org.apache.solr.common.params.MapSolrParams; -import org.apache.solr.update.processor.Lookup3Signature; - -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.WordCache; -import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.federate.solr.Boost; -import net.yacy.cora.language.synonyms.SynonymLibrary; -import net.yacy.cora.lod.vocabulary.Tagging; -import net.yacy.document.language.Identificator; -import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.word.Word; -import net.yacy.kelondro.data.word.WordReferenceRow; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.Bitfield; -import net.yacy.kelondro.util.SetTools; - - -public final class Condenser { - - // this is the page analysis class - public final static boolean 
pseudostemming = false; // switch for removal of words that appear in shortened form - public final static int wordminsize = 2; - public final static int wordcut = 2; - - // category flags that show how the page can be distinguished in different interest groups - public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of') - public static final int flag_cat_haslocation = 19; // the page has a location metadata attached - public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images - public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file - public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos - public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file - - //private Properties analysis; - private final Map words; // a string (the words) to (indexWord) - relation - private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging - private final Set synonyms; // a set of synonyms to the words - private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection - private String fuzzy_signature_text = null; // signatures for double-check detection - - public int RESULT_NUMB_WORDS = -1; - public int RESULT_DIFF_WORDS = -1; - public int RESULT_NUMB_SENTENCES = -1; - public int RESULT_DIFF_SENTENCES = -1; - public Bitfield RESULT_FLAGS = new Bitfield(4); - private final Identificator languageIdentificator; - - public Condenser( - final Document document, - final boolean indexText, - final boolean indexMedia, - final WordCache meaningLib, - final SynonymLibrary synlib, - final boolean doAutotagging - ) { - Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging - // if addMedia == true, then all the media links are also parsed and added to the words - // added media words are flagged with the 
appropriate media flag - this.words = new HashMap(); - this.synonyms = new LinkedHashSet(); - this.RESULT_FLAGS = new Bitfield(4); - - // construct flag set for document - if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); - if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true); - if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true); - if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true); - if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true); - - this.languageIdentificator = new Identificator(); - - // add the URL components to the word list - insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); - - Map.Entry entry; - if (indexText) { - createCondensement(document.getTextString(), meaningLib, doAutotagging); - // the phrase counter: - // phrase 0 are words taken from the URL - // phrase 1 is the MainTitle - // phrase 2 is - // phrase 3 is the Document Abstract - // phrase 4 is the Document Author - // phrase 5 is the Document Publisher - // phrase 6 are the tags specified in document - // phrase 10 and above are the section headlines/titles (88 possible) - // phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!) 
- // phrase 99 is taken from the media Link url and anchor description - // phrase 100 and above are lines from the text - insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); - // missing: tags! - final String[] titles = document.getSectionTitles(); - for (int i = 0; i < titles.length; i++) { - insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib); - } - - // anchors: for text indexing we add only the anchor description - // REMOVED! Reason: - // words from the anchor description should appear as normal text in the output from the parser - // to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of - // pages that are shown in the search result. 
The words from the URLS should also not appear as part of the index, because they - // are not visible in the text and could be used to crate fake-content - /* - final Iterator> i = document.getAnchors().entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - if ((entry == null) || (entry.getKey() == null)) continue; - insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true); - } - */ - } else { - this.RESULT_NUMB_WORDS = 0; - this.RESULT_DIFF_WORDS = 0; - this.RESULT_NUMB_SENTENCES = 0; - this.RESULT_DIFF_SENTENCES = 0; - } - - if (indexMedia) { - // add anchor descriptions: here, we also add the url components - // audio - Iterator> i = document.getAudiolinks().entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); - insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib); - } - - // video - i = document.getVideolinks().entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib); - insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib); - } - - // applications - i = document.getApplinks().entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib); - insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib); - } - - // images - final Iterator j = document.getImages().values().iterator(); - ImageEntry ientry; - MultiProtocolURI url; - while (j.hasNext()) { - ientry = j.next(); - url = ientry.url(); - if (url == 
null) continue; - insertTextToWords(new SentenceReader(url.toNormalform(true)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib); - insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib); - } - - // finally check all words for missing flag entry - final Iterator> k = this.words.entrySet().iterator(); - Word wprop; - Map.Entry we; - while (k.hasNext()) { - we = k.next(); - wprop = we.getValue(); - if (wprop.flags == null) { - wprop.flags = this.RESULT_FLAGS.clone(); - this.words.put(we.getKey(), wprop); - } - } - } - - // extend the tags in the document object with autotagging tags - if (!this.tags.isEmpty()) { - document.addMetatags(this.tags); - } - - if (synlib != null) { - for (String word: this.words.keySet()) { - Set syms = synlib.getSynonyms(word); - if (syms != null) this.synonyms.addAll(syms); - } - } - String text = document.getTextString(); - - // create the synonyms set - if (synonyms != null) { - for (String word: this.words.keySet()) { - Set syms = synlib.getSynonyms(word); - if (syms != null) this.synonyms.addAll(syms); - } - } - - // create hashes for duplicate detection - // check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b - EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature(); - Map sp = new HashMap(); - sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5! 
- sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen())); - fuzzySignatureFactory.init(new MapSolrParams(sp)); - fuzzySignatureFactory.add(text); - byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature(); - long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff); - this.fuzzy_signature = l; - this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString(); - Lookup3Signature exactSignatureFactory = new Lookup3Signature(); - exactSignatureFactory.add(text); - byte[] exact_signature_hash = exactSignatureFactory.getSignature(); - l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff); - this.exact_signature = l; - } - - private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) { - this.languageIdentificator = null; // we don't need that here - // analysis = new Properties(); - this.words = new TreeMap(); - this.synonyms = new HashSet(); - createCondensement(text, meaningLib, doAutotagging); - } - - private void insertTextToWords( - final SentenceReader text, - final int phrase, - final int flagpos, - final Bitfield flagstemplate, - final boolean useForLanguageIdentification, - final WordCache meaningLib) { - if (text == null) return; - String word; - Word wprop; - WordTokenizer wordenum = new WordTokenizer(text, meaningLib); - try { - int pip = 0; - while (wordenum.hasMoreElements()) { - word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); - if (useForLanguageIdentification) this.languageIdentificator.add(word); - if (word.length() < 2) continue; - wprop = this.words.get(word); - if (wprop == null) wprop = new Word(0, pip, phrase); - if (wprop.flags == null) wprop.flags = flagstemplate.clone(); - wprop.flags.set(flagpos, true); - this.words.put(word, wprop); - pip++; - this.RESULT_NUMB_WORDS++; - this.RESULT_DIFF_WORDS++; - } - } finally { - wordenum.close(); - } - } - - public int excludeWords(final SortedSet 
stopwords) { - // subtracts the given stopwords from the word list - // the word list shrinkes. This returns the number of shrinked words - final int oldsize = this.words.size(); - SetTools.excludeDestructive(this.words, stopwords); - return oldsize - this.words.size(); - } - - public Map words() { - // returns the words as word/indexWord relation map - return this.words; - } - - public List synonyms() { - ArrayList l = new ArrayList(this.synonyms.size()); - for (String s: this.synonyms) l.add(s); - return l; - } - - public long fuzzySignature() { - return this.fuzzy_signature; - } - - public String fuzzySignatureText() { - return this.fuzzy_signature_text; - } - - public long exactSignature() { - return this.exact_signature; - } - - public String language() { - return this.languageIdentificator.getLanguage(); - } - - private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) { - assert text != null; - final Set currsentwords = new HashSet(); - String word = ""; - String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; - for (int i = 0; i < wordcache.length; i++) wordcache[i] = ""; - String k; - Tagging.Metatag tag; - int wordlen; - Word wsp; - final Word wsp1; - int wordHandle; - int wordHandleCount = 0; - final int sentenceHandleCount = 0; - int allwordcounter = 0; - final int allsentencecounter = 0; - int wordInSentenceCounter = 1; - boolean comb_indexof = false, last_last = false, last_index = false; - final Map sentences = new HashMap(100); - if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false; - - // read source - final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib); - try { - while (wordenum.hasMoreElements()) { - word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); - if (this.languageIdentificator != null) this.languageIdentificator.add(word); - if (word.length() < wordminsize) continue; - - // get tags from autotagging - 
if (doAutotagging) { - for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { - // wordc is number of words that are tested - StringBuilder sb = new StringBuilder(); - if (wordc == 1) { - sb.append(word); - } else { - for (int w = 0; w < wordc - 1; w++) { - sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' '); - } - sb.append(word); - } - String testterm = sb.toString().trim(); - //System.out.println("Testing: " + testterm); - tag = LibraryProvider.autotagging.getTagFromTerm(testterm); - if (tag != null) { - String navigatorName = tag.getVocabularyName(); - Set tagset = this.tags.get(navigatorName); - if (tagset == null) { - tagset = new HashSet(); - this.tags.put(navigatorName, tagset); - } - tagset.add(tag); - } - } - } - // shift wordcache - System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1); - wordcache[wordcache.length - 1] = word; - - // distinguish punctuation and words - wordlen = word.length(); - if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { - // store sentence - currsentwords.clear(); - wordInSentenceCounter = 1; - } else { - // check index.of detection - if (last_last && comb_indexof && word.equals("modified")) { - this.RESULT_FLAGS.set(flag_cat_indexof, true); - wordenum.pre(true); // parse lines as they come with CRLF - } - if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; - last_last = word.equals("last"); - last_index = word.equals("index"); - - // store word - allwordcounter++; - currsentwords.add(word); - wsp = this.words.get(word); - if (wsp != null) { - // word already exists - wordHandle = wsp.posInText; - wsp.inc(); - } else { - // word does not yet exist, create new word entry - wordHandle = wordHandleCount++; - wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100); - wsp.flags = this.RESULT_FLAGS.clone(); - this.words.put(word, wsp); - } - // we now have the unique handle of the word, put it into the sentence: - wordInSentenceCounter++; - 
} - } - } finally { - wordenum.close(); - } - - if (pseudostemming) { - Map.Entry entry; - // we search for similar words and reorganize the corresponding sentences - // a word is similar, if a shortened version is equal - final Iterator> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order - wordsearch: while (wi.hasNext()) { - entry = wi.next(); - word = entry.getKey(); - wordlen = word.length(); - wsp = entry.getValue(); - for (int i = wordcut; i > 0; i--) { - if (wordlen > i) { - k = word.substring(0, wordlen - i); - if (this.words.containsKey(k)) { - // update word counter - wsp1.count = wsp1.count + wsp.count; - this.words.put(k, wsp1); - // remove current word - wi.remove(); - continue wordsearch; - } - } - } - } - } - - // store result - //this.RESULT_NUMB_TEXT_BYTES = wordenum.count(); - this.RESULT_NUMB_WORDS = allwordcounter; - this.RESULT_DIFF_WORDS = wordHandleCount; - this.RESULT_NUMB_SENTENCES = allsentencecounter; - this.RESULT_DIFF_SENTENCES = sentenceHandleCount; - } - - public static Map getWords(final String text, final WordCache meaningLib) { - // returns a word/indexWord relation map - if (text == null) return null; - return new Condenser(text, meaningLib, false).words(); - } - - public static void main(final String[] args) { - // read a property file and convert them into configuration lines - try { - final File f = new File(args[0]); - final Properties p = new Properties(); - p.load(new FileInputStream(f)); - final StringBuilder sb = new StringBuilder(); - sb.append("{\n"); - for (int i = 0; i <= 15; i++) { - sb.append('"'); - final String s = p.getProperty("keywords" + i); - final String[] l = s.split(","); - for (final String element : l) { - sb.append(ASCII.String(Word.word2hash(element))); - } - if (i < 15) sb.append(",\n"); - } - sb.append("}\n"); - System.out.println(sb.toString()); - } catch (final FileNotFoundException e) { - Log.logException(e); - } catch (final IOException e) { - Log.logException(e); - 
} - - } - -} +/** + * Condenser.java + * Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 09.01.2004 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeMap; + +import org.apache.solr.common.params.MapSolrParams; +import org.apache.solr.update.processor.Lookup3Signature; + +import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.WordCache; +import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.federate.solr.Boost; +import net.yacy.cora.language.synonyms.SynonymLibrary; +import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.document.language.Identificator; +import net.yacy.document.parser.html.ImageEntry; +import 
net.yacy.kelondro.data.word.Word; +import net.yacy.kelondro.data.word.WordReferenceRow; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.Bitfield; +import net.yacy.kelondro.util.SetTools; + + +public final class Condenser { + + // this is the page analysis class + public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form + public final static int wordminsize = 2; + public final static int wordcut = 2; + + // category flags that show how the page can be distinguished in different interest groups + public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of') + public static final int flag_cat_haslocation = 19; // the page has a location metadata attached + public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images + public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file + public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos + public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file + + //private Properties analysis; + private final Map words; // a string (the words) to (indexWord) - relation + private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging + private final Set synonyms; // a set of synonyms to the words + private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection + private String fuzzy_signature_text = null; // signatures for double-check detection + + public int RESULT_NUMB_WORDS = -1; + public int RESULT_DIFF_WORDS = -1; + public int RESULT_NUMB_SENTENCES = -1; + public int RESULT_DIFF_SENTENCES = -1; + public Bitfield RESULT_FLAGS = new Bitfield(4); + private final Identificator languageIdentificator; + + public Condenser( + final Document document, + final boolean indexText, + final boolean indexMedia, + final 
WordCache meaningLib, + final SynonymLibrary synlib, + final boolean doAutotagging + ) { + Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging + // if addMedia == true, then all the media links are also parsed and added to the words + // added media words are flagged with the appropriate media flag + this.words = new HashMap(); + this.synonyms = new LinkedHashSet(); + this.RESULT_FLAGS = new Bitfield(4); + + // construct flag set for document + if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); + if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true); + if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true); + if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true); + if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true); + + this.languageIdentificator = new Identificator(); + + // add the URL components to the word list + insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); + + Map.Entry entry; + if (indexText) { + createCondensement(document.getTextString(), meaningLib, doAutotagging); + // the phrase counter: + // phrase 0 are words taken from the URL + // phrase 1 is the MainTitle + // phrase 2 is + // phrase 3 is the Document Abstract + // phrase 4 is the Document Author + // phrase 5 is the Document Publisher + // phrase 6 are the tags specified in document + // phrase 10 and above are the section headlines/titles (88 possible) + // phrase 98 is taken from the embedded anchor/hyperlinks 
description (REMOVED!) + // phrase 99 is taken from the media Link url and anchor description + // phrase 100 and above are lines from the text + insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); + // missing: tags! + final String[] titles = document.getSectionTitles(); + for (int i = 0; i < titles.length; i++) { + insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib); + } + + // anchors: for text indexing we add only the anchor description + // REMOVED! Reason: + // words from the anchor description should appear as normal text in the output from the parser + // to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of + // pages that are shown in the search result. 
The words from the URLS should also not appear as part of the index, because they + // are not visible in the text and could be used to crate fake-content + /* + final Iterator> i = document.getAnchors().entrySet().iterator(); + while (i.hasNext()) { + entry = i.next(); + if ((entry == null) || (entry.getKey() == null)) continue; + insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true); + } + */ + } else { + this.RESULT_NUMB_WORDS = 0; + this.RESULT_DIFF_WORDS = 0; + this.RESULT_NUMB_SENTENCES = 0; + this.RESULT_DIFF_SENTENCES = 0; + } + + if (indexMedia) { + // add anchor descriptions: here, we also add the url components + // audio + Iterator> i = document.getAudiolinks().entrySet().iterator(); + while (i.hasNext()) { + entry = i.next(); + insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib); + } + + // video + i = document.getVideolinks().entrySet().iterator(); + while (i.hasNext()) { + entry = i.next(); + insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib); + } + + // applications + i = document.getApplinks().entrySet().iterator(); + while (i.hasNext()) { + entry = i.next(); + insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib); + } + + // images + final Iterator j = document.getImages().values().iterator(); + ImageEntry ientry; + MultiProtocolURI url; + while (j.hasNext()) { + ientry = j.next(); + url = ientry.url(); + if (url == 
null) continue; + insertTextToWords(new SentenceReader(url.toNormalform(true)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib); + } + + // finally check all words for missing flag entry + final Iterator> k = this.words.entrySet().iterator(); + Word wprop; + Map.Entry we; + while (k.hasNext()) { + we = k.next(); + wprop = we.getValue(); + if (wprop.flags == null) { + wprop.flags = this.RESULT_FLAGS.clone(); + this.words.put(we.getKey(), wprop); + } + } + } + + // extend the tags in the document object with autotagging tags + if (!this.tags.isEmpty()) { + document.addMetatags(this.tags); + } + + if (synlib != null) { + for (String word: this.words.keySet()) { + Set syms = synlib.getSynonyms(word); + if (syms != null) this.synonyms.addAll(syms); + } + } + String text = document.getTextString(); + + // create the synonyms set + if (synonyms != null) { + for (String word: this.words.keySet()) { + Set syms = synlib.getSynonyms(word); + if (syms != null) this.synonyms.addAll(syms); + } + } + + // create hashes for duplicate detection + // check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b + EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature(); + Map sp = new HashMap(); + sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5! 
+ sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen())); + fuzzySignatureFactory.init(new MapSolrParams(sp)); + fuzzySignatureFactory.add(text); + byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature(); + long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff); + this.fuzzy_signature = l; + this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString(); + Lookup3Signature exactSignatureFactory = new Lookup3Signature(); + exactSignatureFactory.add(text); + byte[] exact_signature_hash = exactSignatureFactory.getSignature(); + l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff); + this.exact_signature = l; + } + + private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) { + this.languageIdentificator = null; // we don't need that here + // analysis = new Properties(); + this.words = new TreeMap(); + this.synonyms = new HashSet(); + createCondensement(text, meaningLib, doAutotagging); + } + + private void insertTextToWords( + final SentenceReader text, + final int phrase, + final int flagpos, + final Bitfield flagstemplate, + final boolean useForLanguageIdentification, + final WordCache meaningLib) { + if (text == null) return; + String word; + Word wprop; + WordTokenizer wordenum = new WordTokenizer(text, meaningLib); + try { + int pip = 0; + while (wordenum.hasMoreElements()) { + word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); + if (useForLanguageIdentification) this.languageIdentificator.add(word); + if (word.length() < 2) continue; + wprop = this.words.get(word); + if (wprop == null) wprop = new Word(0, pip, phrase); + if (wprop.flags == null) wprop.flags = flagstemplate.clone(); + wprop.flags.set(flagpos, true); + this.words.put(word, wprop); + pip++; + this.RESULT_NUMB_WORDS++; + this.RESULT_DIFF_WORDS++; + } + } finally { + wordenum.close(); + } + } + + public int excludeWords(final SortedSet 
stopwords) { + // subtracts the given stopwords from the word list + // the word list shrinks. This returns the number of shrunk words + final int oldsize = this.words.size(); + SetTools.excludeDestructive(this.words, stopwords); + return oldsize - this.words.size(); + } + + public Map words() { + // returns the words as word/indexWord relation map + return this.words; + } + + public List synonyms() { + ArrayList l = new ArrayList(this.synonyms.size()); + for (String s: this.synonyms) l.add(s); + return l; + } + + public long fuzzySignature() { + return this.fuzzy_signature; + } + + public String fuzzySignatureText() { + return this.fuzzy_signature_text; + } + + public long exactSignature() { + return this.exact_signature; + } + + public String language() { + return this.languageIdentificator.getLanguage(); + } + + private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) { + assert text != null; + final Set currsentwords = new HashSet(); + String word = ""; + String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; + for (int i = 0; i < wordcache.length; i++) wordcache[i] = ""; + String k; + Tagging.Metatag tag; + int wordlen; + Word wsp; + final Word wsp1; + int wordHandle; + int wordHandleCount = 0; + final int sentenceHandleCount = 0; + int allwordcounter = 0; + final int allsentencecounter = 0; + int wordInSentenceCounter = 1; + boolean comb_indexof = false, last_last = false, last_index = false; + final Map sentences = new HashMap(100); + if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false; + + // read source + final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib); + try { + while (wordenum.hasMoreElements()) { + word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); + if (this.languageIdentificator != null) this.languageIdentificator.add(word); + if (word.length() < wordminsize) continue; + + // get tags from autotagging + 
if (doAutotagging) { + for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) { + // wordc is number of words that are tested + StringBuilder sb = new StringBuilder(); + if (wordc == 1) { + sb.append(word); + } else { + for (int w = 0; w < wordc - 1; w++) { + sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' '); + } + sb.append(word); + } + String testterm = sb.toString().trim(); + //System.out.println("Testing: " + testterm); + tag = LibraryProvider.autotagging.getTagFromTerm(testterm); + if (tag != null) { + String navigatorName = tag.getVocabularyName(); + Set tagset = this.tags.get(navigatorName); + if (tagset == null) { + tagset = new HashSet(); + this.tags.put(navigatorName, tagset); + } + tagset.add(tag); + } + } + } + // shift wordcache + System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1); + wordcache[wordcache.length - 1] = word; + + // distinguish punctuation and words + wordlen = word.length(); + if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { + // store sentence + currsentwords.clear(); + wordInSentenceCounter = 1; + } else { + // check index.of detection + if (last_last && comb_indexof && word.equals("modified")) { + this.RESULT_FLAGS.set(flag_cat_indexof, true); + wordenum.pre(true); // parse lines as they come with CRLF + } + if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; + last_last = word.equals("last"); + last_index = word.equals("index"); + + // store word + allwordcounter++; + currsentwords.add(word); + wsp = this.words.get(word); + if (wsp != null) { + // word already exists + wordHandle = wsp.posInText; + wsp.inc(); + } else { + // word does not yet exist, create new word entry + wordHandle = wordHandleCount++; + wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100); + wsp.flags = this.RESULT_FLAGS.clone(); + this.words.put(word, wsp); + } + // we now have the unique handle of the word, put it into the sentence: + wordInSentenceCounter++; + 
} + } + } finally { + wordenum.close(); + } + + if (pseudostemming) { + Map.Entry entry; + // we search for similar words and reorganize the corresponding sentences + // a word is similar, if a shortened version is equal + final Iterator> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order + wordsearch: while (wi.hasNext()) { + entry = wi.next(); + word = entry.getKey(); + wordlen = word.length(); + wsp = entry.getValue(); + for (int i = wordcut; i > 0; i--) { + if (wordlen > i) { + k = word.substring(0, wordlen - i); + if (this.words.containsKey(k)) { + // update word counter + wsp1.count = wsp1.count + wsp.count; + this.words.put(k, wsp1); + // remove current word + wi.remove(); + continue wordsearch; + } + } + } + } + } + + // store result + //this.RESULT_NUMB_TEXT_BYTES = wordenum.count(); + this.RESULT_NUMB_WORDS = allwordcounter; + this.RESULT_DIFF_WORDS = wordHandleCount; + this.RESULT_NUMB_SENTENCES = allsentencecounter; + this.RESULT_DIFF_SENTENCES = sentenceHandleCount; + } + + public static Map getWords(final String text, final WordCache meaningLib) { + // returns a word/indexWord relation map + if (text == null) return null; + return new Condenser(text, meaningLib, false).words(); + } + + public static void main(final String[] args) { + // read a property file and convert them into configuration lines + try { + final File f = new File(args[0]); + final Properties p = new Properties(); + p.load(new FileInputStream(f)); + final StringBuilder sb = new StringBuilder(); + sb.append("{\n"); + for (int i = 0; i <= 15; i++) { + sb.append('"'); + final String s = p.getProperty("keywords" + i); + final String[] l = s.split(","); + for (final String element : l) { + sb.append(ASCII.String(Word.word2hash(element))); + } + if (i < 15) sb.append(",\n"); + } + sb.append("}\n"); + System.out.println(sb.toString()); + } catch (final FileNotFoundException e) { + Log.logException(e); + } catch (final IOException e) { + Log.logException(e); + 
} + + } + +} diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index e63ad37e6..ca7a7df43 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -722,7 +722,7 @@ dc_rights final String language = dc_language(); if (language != null && language.length() > 0) os.write("" + dc_language() + "\n"); os.write("" + ISO8601Formatter.FORMATTER.format(date) + "\n"); - if (this.lon != 0.0f && this.lat != 0.0f) os.write("" + this.lon +"" + this.lat + "\n"); + if (this.lon != 0.0 && this.lat != 0.0) os.write("" + this.lon +"" + this.lat + "\n"); os.write("\n"); } @@ -821,7 +821,7 @@ dc_rights anchors.putAll(doc.getAnchors()); rss.putAll(doc.getRSS()); ContentScraper.addAllImages(images, doc.getImages()); - if (doc.lon() != 0.0f && doc.lat() != 0.0f) { lon = doc.lon(); lat = doc.lat(); } + if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); } } // clean up parser data diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index 185868414..087760853 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -226,7 +226,7 @@ public class URIMetadataRow { s.appendLF(); if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); s.appendLF(); - if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); + if (lon == 0.0 && lat == 0.0) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); String s0 = s.toString(); s.close(); return UTF8.getBytes(s0); @@ -514,7 +514,11 @@ public class URIMetadataRow { if (p < 0) { return 0.0d; } - return this.latlon.charAt(0) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(0, p)); + try { + return this.latlon.charAt(0) > '9' ? 
0.0d : Double.parseDouble(this.latlon.substring(0, p)); + } catch (NumberFormatException e) { + return 0.0d; + } } public double lon() { if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; @@ -522,7 +526,11 @@ public class URIMetadataRow { if (p < 0) { return 0.0d; } - return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1)); + try { + return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1)); + } catch (NumberFormatException e) { + return 0.0d; + } } } diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java index 6709c5eea..f7c446356 100644 --- a/source/net/yacy/migration.java +++ b/source/net/yacy/migration.java @@ -33,6 +33,11 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import com.google.common.io.Files; +import java.util.Iterator; +import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.index.Index; +import net.yacy.kelondro.index.Row; +import net.yacy.search.index.Fulltext; public class migration { //SVN constants @@ -256,4 +261,82 @@ public class migration { sb.setConfig("crawler.http.acceptCharset", sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7")); } } + /** + * converts old urldb to Solr. + * In chunks of 1000 entries. 
+ * Creates a lock file in workdir to allow only one active migration thread + * @return current size of urldb index + */ + @SuppressWarnings("deprecation") + public static int migrateUrldbtoSolr(final Switchboard sb) { + int ret = 0; + final File f; + final Fulltext ft = sb.index.fulltext(); + + if (ft.getURLDb() != null) { + ret = ft.getURLDb().size(); + f = new File(sb.workPath, "migrateUrldbtoSolr.lck"); + f.deleteOnExit(); + if (f.exists()) { + return ret; + } else { + try { + f.createNewFile(); + } catch (IOException ex) { + Log.logInfo("migrateUrldbtoSolr","could not create lock file"); + } + } + + final Thread t = new Thread() { + boolean go = true; + final Index urldb = ft.getURLDb(); + + public void run() { + try { + Thread.currentThread().setName("migration.migrateUrldbtoSolr"); + + int i = urldb.size(); + while (go && i > 0) { + + List chunk = urldb.random(1000); + if ((chunk == null) || (chunk.size() == 0)) { + go = false; + break; + } + Iterator chunkit = chunk.iterator(); + + while (go && chunkit.hasNext()) { + try { // to catch any data errors + URIMetadataRow row = new URIMetadataRow(chunkit.next(), null); + ft.putMetadata(row); // this deletes old urldb-entry first and inserts into Solr + i--; + if (Switchboard.getSwitchboard().shallTerminate()) { + go = false; + } + } catch (Exception e) { + Log.logInfo("migrateUrldbtoSolr", "some error while adding old data to new index, continue with next entry"); + } + } + Log.logInfo("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)"); + } + ft.commit(); + + } catch (IOException ex) { + Log.logInfo("migrateUrldbtoSolr", "error reading old urldb index"); + } finally { + if (f.exists()) { + f.delete(); // delete lock file + } + } + } + + public void exit() { + go = false; + } + }; + t.setPriority(Thread.MIN_PRIORITY); + t.start(); + } + return ret; + } } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 
0228c1b2f..036269e1a 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -100,6 +100,28 @@ public final class Fulltext implements Iterable { this.forcedCommitTime = 0; } + /** + * @deprecated + * used only for migration + * @return the connected URLDb + + */ + @Deprecated + public Index getURLDb() { + return this.urlIndexFile; + } + + /** + * true if old metadata index URLDb is connected. + * used only for migration + * @deprecated + * current and future versions use Solr for metadata + */ + @Deprecated + public boolean connectedURLDb() { + return this.urlIndexFile != null; + } + protected void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) { if (this.urlIndexFile != null) return; this.tablename = tablename; diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index e6de803e9..90be0c28b 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -274,7 +274,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, "UTF8"); // coordinates - if (md.lat() != 0.0f && md.lon() != 0.0f) { + if (md.lat() != 0.0 && md.lon() != 0.0) { if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(md.lat()) + "," + Double.toString(md.lon())); } if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, 200); @@ -794,7 +794,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, document.getCharset()); // coordinates - if (document.lat() != 0.0f && document.lon() != 0.0f) { + if (document.lat() != 0.0 && document.lon() != 0.0) { if (allAttr || 
contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon())); } if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode()); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index ed7714648..a4809690c 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -836,7 +836,7 @@ public final class SearchEvent { } // check location constraint - if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && (page.lat() == 0.0f || page.lon() == 0.0f)) { + if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && (page.lat() == 0.0 || page.lon() == 0.0)) { this.query.misses.add(page.hash()); continue; }