added (manual) urldb migration (link on: Index Administration -> Federated Solr Index)

- migrates all entries in old urldb

Metadata coordinate (lat / lon) NumberFormatException still occurs relatively often (see excerpt below):
- added try/catch for URIMetadataRow (seems not to be needed in URIMetadataNode, as Solr internally checks the number format)
- removed a possible type conversion in the lat() / lon() comparison with 0.0f by changing the literal to 0.0 (leaving it to the compiler/optimizer to choose the number format)

current log excerpt for NumberFormatException:
W 2013/01/14 00:10:07 StackTrace For input string: "-"
java.lang.NumberFormatException: For input string: "-"
	at sun.misc.FloatingDecimal.readJavaFormatString(Unknown Source)
	at java.lang.Double.parseDouble(Unknown Source)
	at transferURL.respond(
Caused by: java.lang.NumberFormatException: For input string: "-"
	at sun.misc.FloatingDecimal.readJavaFormatString(Unknown Source)
	at java.lang.Double.parseDouble(Unknown Source)
	at transferURL.respond(
reger 12 years ago
parent 3b6e08b49f
commit 3897bb4409

@ -26,7 +26,8 @@
This is a switchboard for the usage of embedded metadata to embedded solr.
The rwi index is necessary for index transmission and shall be switched off in future portalmode configurations.
<dt><input type="checkbox" name="core.service.fulltext" id="core.service.fulltext" #(core.service.fulltext.checked)#:: checked="checked"#(/core.service.fulltext.checked)# /></dt><dd>url metadata and embedded solr fulltext search index, interface at: <a href="/solr/select?q=*:*&start=0&rows=10" target="_blank">/solr/select?q=*:*&amp;start=0&amp;rows=10</a></dd>
<dt><input type="checkbox" name="core.service.fulltext" id="core.service.fulltext" #(core.service.fulltext.checked)#:: checked="checked"#(/core.service.fulltext.checked)# /></dt><dd>url metadata and embedded solr fulltext search index, interface at: <a href="/solr/select?q=*:*&start=0&rows=10" target="_blank">/solr/select?q=*:*&amp;start=0&amp;rows=10</a>
#(migrateUrlDbtoSolr)#:: <input type="button" class="submitready" onclick="window.location = '/api/migrateurldb_p.html';" value="migrate old index" />#(/migrateUrlDbtoSolr)# </dd>
<dt><input type="checkbox" name="core.service.rwi.tmp" id="core.service.rwi" #(core.service.rwi.tmp.checked)#:: checked="checked"#(/core.service.rwi.tmp.checked)# /></dt><dd>embedded 'classic' rwi index</dd>
<dt><input type="checkbox" name="core.service.citation.tmp" id="core.service.citation" #(core.service.citation.tmp.checked)#:: checked="checked"#(/core.service.citation.tmp.checked)# /></dt><dd>embedded citation reference index (link structure, used for ranking)</dd>
<dt></dt><dd><input type="submit" name="set" value="Set" /></dd>

@ -225,6 +225,10 @@ public class IndexFederated_p {
prop.put("solr.indexing.sharding", env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SHARDING, "modulo-host-md5"));
prop.put("solr.indexing.schemefile", schemename);
if ((sb.index.fulltext().connectedURLDb())) {
prop.put("migrateUrlDbtoSolr", 1);
} else prop.put("migrateUrlDbtoSolr", 0);
// return rewrite properties
return prop;

@ -0,0 +1,36 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "">
<html xmlns="">
<title>Migrate URLdb</title>
<h2>Migrate URLdb to embedded Solr Index</h2>
<p>Convert old meta data (urldb) index to embedded Solr fulltext index.</p>
<p>A low priority background job has been started which reads the old index, adds it to Solr and deletes the entry from the old index.</p>
<p>The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).<br />
If you feel that the not accessed entries are still relevant, with this migration all entries from the old urldb index will be migrated.</p>
<p>You may refresh this page to see how many entries in the old index are left for migration</p>
<p>Hint: this background task runs until all entries are migrated or YaCy is shutdown. The migration is not automatically restarted.</p>
<form action="migrateurldb_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<input type="hidden" name="lastcount" value="#[lastcount]#" />
<input type="hidden" name="lasttime" value="#[lasttime]#" />
<p><b>#[count]# entries</b> in old index left to migrate.</p>
<p>For large indexes this may run for a long time (migration speed: #[speed]# entries per minute) <input type="submit" name="dorefresh" value="refresh" /></p>

@ -0,0 +1,44 @@
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.migration;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public class migrateurldb_p {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
int cnt;
if ((cnt = migration.migrateUrldbtoSolr(sb)) > 0) {
prop.put("count", cnt);
if (post != null && post.containsKey("dorefresh")) {
int lastcount = post.getInt("lastcount", 0);
Long t = post.getLong("lasttime", 1);
Double difft = (System.currentTimeMillis() - t) / 60000.0d;
int diff = (int)((lastcount - cnt) / difft) ;
prop.put("speed", diff);
prop.put("lasttime", t);
prop.put("lastcount", lastcount);
} else {
prop.put("speed", "?");
prop.put("lasttime", System.currentTimeMillis());
} else {
prop.put("speed", "");
prop.put("count", "no urldb index available");
// return rewrite properties
return prop;

@ -1,495 +1,495 @@
* Copyright 2004 by Michael Peter Christen,, Frankfurt am Main, Germany
* First released 09.01.2004 at
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <>.
package net.yacy.document;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.update.processor.Lookup3Signature;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.SetTools;
public final class Condenser {
// this is the page analysis class
public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
public final static int wordminsize = 2;
public final static int wordcut = 2;
// category flags that show how the page can be distinguished in different interest groups
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
//private Properties analysis;
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
private final Set<String> synonyms; // a set of synonyms to the words
private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
private String fuzzy_signature_text = null; // signatures for double-check detection
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public Bitfield RESULT_FLAGS = new Bitfield(4);
private final Identificator languageIdentificator;
public Condenser(
final Document document,
final boolean indexText,
final boolean indexMedia,
final WordCache meaningLib,
final SynonymLibrary synlib,
final boolean doAutotagging
) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this.words = new HashMap<String, Word>();
this.synonyms = new LinkedHashSet<String>();
this.RESULT_FLAGS = new Bitfield(4);
// construct flag set for document
if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
if ( != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
this.languageIdentificator = new Identificator();
// add the URL components to the word list
insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
createCondensement(document.getTextString(), meaningLib, doAutotagging);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle
// phrase 2 is <not used>
// phrase 3 is the Document Abstract
// phrase 4 is the Document Author
// phrase 5 is the Document Publisher
// phrase 6 are the tags specified in document
// phrase 10 and above are the section headlines/titles (88 possible)
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
// missing: tags!
final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
// anchors: for text indexing we add only the anchor description
// REMOVED! Reason:
// words from the anchor description should appear as normal text in the output from the parser
// to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of
// pages that are shown in the search result. The words from the URLS should also not appear as part of the index, because they
// are not visible in the text and could be used to crate fake-content
final Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
while (i.hasNext()) {
entry =;
if ((entry == null) || (entry.getKey() == null)) continue;
insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
} else {
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry =;
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry =;
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry =;
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
// images
final Iterator<ImageEntry> j = document.getImages().values().iterator();
ImageEntry ientry;
MultiProtocolURI url;
while (j.hasNext()) {
ientry =;
url = ientry.url();
if (url == null) continue;
insertTextToWords(new SentenceReader(url.toNormalform(true)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
// finally check all words for missing flag entry
final Iterator<Map.Entry<String, Word>> k = this.words.entrySet().iterator();
Word wprop;
Map.Entry<String, Word> we;
while (k.hasNext()) {
we =;
wprop = we.getValue();
if (wprop.flags == null) {
wprop.flags = this.RESULT_FLAGS.clone();
this.words.put(we.getKey(), wprop);
// extend the tags in the document object with autotagging tags
if (!this.tags.isEmpty()) {
if (synlib != null) {
for (String word: this.words.keySet()) {
Set<String> syms = synlib.getSynonyms(word);
if (syms != null) this.synonyms.addAll(syms);
String text = document.getTextString();
// create the synonyms set
if (synonyms != null) {
for (String word: this.words.keySet()) {
Set<String> syms = synlib.getSynonyms(word);
if (syms != null) this.synonyms.addAll(syms);
// create hashes for duplicate detection
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
Map<String,String> sp = new HashMap<String,String>();
sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
fuzzySignatureFactory.init(new MapSolrParams(sp));
byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff);
this.fuzzy_signature = l;
this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString();
Lookup3Signature exactSignatureFactory = new Lookup3Signature();
byte[] exact_signature_hash = exactSignatureFactory.getSignature();
l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff);
this.exact_signature = l;
private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
this.words = new TreeMap<String, Word>();
this.synonyms = new HashSet<String>();
createCondensement(text, meaningLib, doAutotagging);
private void insertTextToWords(
final SentenceReader text,
final int phrase,
final int flagpos,
final Bitfield flagstemplate,
final boolean useForLanguageIdentification,
final WordCache meaningLib) {
if (text == null) return;
String word;
Word wprop;
WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
try {
int pip = 0;
while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
if (useForLanguageIdentification) this.languageIdentificator.add(word);
if (word.length() < 2) continue;
wprop = this.words.get(word);
if (wprop == null) wprop = new Word(0, pip, phrase);
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
wprop.flags.set(flagpos, true);
this.words.put(word, wprop);
} finally {
/**
 * Subtracts the given stop words from this condenser's word map.
 *
 * @param stopwords the words to take out of the word map
 * @return the number of entries by which the word map shrank
 */
public int excludeWords(final SortedSet<String> stopwords) {
// the word map is modified in place by the helper call,
// so the size difference before/after is the number of removed words
final int oldsize = this.words.size();
SetTools.excludeDestructive(this.words, stopwords);
return oldsize - this.words.size();
/**
 * @return the word map (word string to its {@code Word} entry); this is the
 *         internal map itself, not a copy
 */
public Map<String, Word> words() {
// returns the words as word/indexWord relation map
return this.words;
/**
 * @return a newly allocated list holding every entry of the synonym set, in
 *         the iteration order of that set
 */
public List<String> synonyms() {
// pre-size the copy to the number of synonyms
ArrayList<String> l = new ArrayList<String>(this.synonyms.size());
for (String s: this.synonyms) l.add(s);
return l;
/**
 * @return the fuzzy signature used for duplicate detection; it is 0 unless
 *         the document constructor has computed it
 */
public long fuzzySignature() {
return this.fuzzy_signature;
/**
 * @return the signature text produced together with the fuzzy signature;
 *         {@code null} unless the document constructor has computed it
 */
public String fuzzySignatureText() {
return this.fuzzy_signature_text;
/**
 * @return the exact signature used for duplicate detection; it is 0 unless
 *         the document constructor has computed it
 */
public long exactSignature() {
return this.exact_signature;
/**
 * @return the language reported by the language identificator, which was fed
 *         with the words seen while condensing
 */
public String language() {
// NOTE(review): languageIdentificator is null for instances created by the private
// text-only constructor, so this call would throw a NullPointerException there
return this.languageIdentificator.getLanguage();
private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
assert text != null;
final Set<String> currsentwords = new HashSet<String>();
String word = "";
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
String k;
Tagging.Metatag tag;
int wordlen;
Word wsp;
final Word wsp1;
int wordHandle;
int wordHandleCount = 0;
final int sentenceHandleCount = 0;
int allwordcounter = 0;
final int allsentencecounter = 0;
int wordInSentenceCounter = 1;
boolean comb_indexof = false, last_last = false, last_index = false;
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
// read source
final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
try {
while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
if (this.languageIdentificator != null) this.languageIdentificator.add(word);
if (word.length() < wordminsize) continue;
// get tags from autotagging
if (doAutotagging) {
for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
// wordc is number of words that are tested
StringBuilder sb = new StringBuilder();
if (wordc == 1) {
} else {
for (int w = 0; w < wordc - 1; w++) {
sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
String testterm = sb.toString().trim();
//System.out.println("Testing: " + testterm);
tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
if (tag != null) {
String navigatorName = tag.getVocabularyName();
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
if (tagset == null) {
tagset = new HashSet<Tagging.Metatag>();
this.tags.put(navigatorName, tagset);
// shift wordcache
System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
wordcache[wordcache.length - 1] = word;
// distinguish punctuation and words
wordlen = word.length();
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
// store sentence
wordInSentenceCounter = 1;
} else {
// check index.of detection
if (last_last && comb_indexof && word.equals("modified")) {
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
// store word
wsp = this.words.get(word);
if (wsp != null) {
// word already exists
wordHandle = wsp.posInText;;
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word, wsp);
// we now have the unique handle of the word, put it into the sentence:
} finally {
if (pseudostemming) {
Map.Entry<String, Word> entry;
// we search for similar words and reorganize the corresponding sentences
// a word is similar, if a shortened version is equal
final Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order
wordsearch: while (wi.hasNext()) {
entry =;
word = entry.getKey();
wordlen = word.length();
wsp = entry.getValue();
for (int i = wordcut; i > 0; i--) {
if (wordlen > i) {
k = word.substring(0, wordlen - i);
if (this.words.containsKey(k)) {
// update word counter
wsp1.count = wsp1.count + wsp.count;
this.words.put(k, wsp1);
// remove current word
continue wordsearch;
// store result
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
this.RESULT_NUMB_WORDS = allwordcounter;
this.RESULT_DIFF_WORDS = wordHandleCount;
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
/**
 * Condenses a plain text (autotagging disabled) and returns the resulting word map.
 *
 * @param text       the text to analyse; may be {@code null}
 * @param meaningLib word cache handed through to the condenser
 * @return the word/indexWord relation map, or {@code null} if {@code text} is {@code null}
 */
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map
if (text == null) return null;
return new Condenser(text, meaningLib, false).words();
public static void main(final String[] args) {
// read a property file and convert them into configuration lines
try {
final File f = new File(args[0]);
final Properties p = new Properties();
p.load(new FileInputStream(f));
final StringBuilder sb = new StringBuilder();
for (int i = 0; i <= 15; i++) {
final String s = p.getProperty("keywords" + i);
final String[] l = s.split(",");
for (final String element : l) {
if (i < 15) sb.append(",\n");
} catch (final FileNotFoundException e) {
} catch (final IOException e) {
* Copyright 2004 by Michael Peter Christen,, Frankfurt am Main, Germany
* First released 09.01.2004 at
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <>.
package net.yacy.document;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.update.processor.Lookup3Signature;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.SetTools;
public final class Condenser {
// this is the page analysis class
public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
public final static int wordminsize = 2;
public final static int wordcut = 2;
// category flags that show how the page can be distinguished in different interest groups
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
//private Properties analysis;
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
private final Set<String> synonyms; // a set of synonyms to the words
private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
private String fuzzy_signature_text = null; // signatures for double-check detection
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public Bitfield RESULT_FLAGS = new Bitfield(4);
private final Identificator languageIdentificator;
public Condenser(
final Document document,
final boolean indexText,
final boolean indexMedia,
final WordCache meaningLib,
final SynonymLibrary synlib,
final boolean doAutotagging
) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this.words = new HashMap<String, Word>();
this.synonyms = new LinkedHashSet<String>();
this.RESULT_FLAGS = new Bitfield(4);
// construct flag set for document
if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
if ( != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
this.languageIdentificator = new Identificator();
// add the URL components to the word list
insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
createCondensement(document.getTextString(), meaningLib, doAutotagging);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle
// phrase 2 is <not used>
// phrase 3 is the Document Abstract
// phrase 4 is the Document Author
// phrase 5 is the Document Publisher
// phrase 6 are the tags specified in document
// phrase 10 and above are the section headlines/titles (88 possible)
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
// missing: tags!
final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
// anchors: for text indexing we add only the anchor description
// REMOVED! Reason:
// words from the anchor description should appear as normal text in the output from the parser
// to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of
// pages that are shown in the search result. The words from the URLS should also not appear as part of the index, because they
// are not visible in the text and could be used to crate fake-content
final Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
while (i.hasNext()) {
entry =;
if ((entry == null) || (entry.getKey() == null)) continue;
insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
} else {
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry =;
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry =;
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry =;
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
// images
final Iterator<ImageEntry> j = document.getImages().values().iterator();
ImageEntry ientry;
MultiProtocolURI url;
while (j.hasNext()) {
ientry =;
url = ientry.url();
if (url == null) continue;
insertTextToWords(new SentenceReader(url.toNormalform(true)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
// finally check all words for missing flag entry
final Iterator<Map.Entry<String, Word>> k = this.words.entrySet().iterator();
Word wprop;
Map.Entry<String, Word> we;
while (k.hasNext()) {
we =;
wprop = we.getValue();
if (wprop.flags == null) {
wprop.flags = this.RESULT_FLAGS.clone();
this.words.put(we.getKey(), wprop);
// extend the tags in the document object with autotagging tags
if (!this.tags.isEmpty()) {
if (synlib != null) {
for (String word: this.words.keySet()) {
Set<String> syms = synlib.getSynonyms(word);
if (syms != null) this.synonyms.addAll(syms);
String text = document.getTextString();
// create the synonyms set
if (synonyms != null) {
for (String word: this.words.keySet()) {
Set<String> syms = synlib.getSynonyms(word);
if (syms != null) this.synonyms.addAll(syms);
// create hashes for duplicate detection
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
Map<String,String> sp = new HashMap<String,String>();
sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
fuzzySignatureFactory.init(new MapSolrParams(sp));
byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff);
this.fuzzy_signature = l;
this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString();
Lookup3Signature exactSignatureFactory = new Lookup3Signature();
byte[] exact_signature_hash = exactSignatureFactory.getSignature();
l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff);
this.exact_signature = l;
// Text-only constructor: condenses a plain string.
// Language identification is switched off (the identificator stays null), the word map and
// the synonym set start out empty, and the actual work is delegated to createCondensement().
private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
this.words = new TreeMap<String, Word>();
this.synonyms = new HashSet<String>();
createCondensement(text, meaningLib, doAutotagging);
// Tokenizes the given text and registers every token of at least two characters in
// this.words, setting the flag bit 'flagpos' on the word entry.
//   text                         - source to tokenize; null makes the method return immediately
//   phrase                       - passed to the Word constructor for words not yet in the map
//   flagpos                      - index of the flag bit that is set on every word found
//   flagstemplate                - cloned as initial flags for words that have no flags yet
//   useForLanguageIdentification - if true, every token is also fed to the language identificator
//   meaningLib                   - handed through to the WordTokenizer
private void insertTextToWords(
final SentenceReader text,
final int phrase,
final int flagpos,
final Bitfield flagstemplate,
final boolean useForLanguageIdentification,
final WordCache meaningLib) {
if (text == null) return;
String word;
Word wprop;
WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
try {
// NOTE(review): pip is handed to the Word constructor below; no increment is visible in this excerpt - confirm
int pip = 0;
while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
// language identification sees every token, including those dropped by the length check below
// NOTE(review): unlike createCondensement() there is no null check on languageIdentificator here,
// although the text-only constructor sets it to null - confirm this path is never reached on such instances
if (useForLanguageIdentification) this.languageIdentificator.add(word);
if (word.length() < 2) continue;
// reuse the existing entry when the word is already known, otherwise create a fresh one
wprop = this.words.get(word);
if (wprop == null) wprop = new Word(0, pip, phrase);
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
wprop.flags.set(flagpos, true);
this.words.put(word, wprop);
} finally {
public int excludeWords(final SortedSet<String> stopwords) {
// removes the given stopwords from the word list in place (via SetTools.excludeDestructive);
// the word list shrinks. Returns the number of words that were removed,
// computed as the difference of the map size before and after the removal
final int oldsize = this.words.size();
SetTools.excludeDestructive(this.words, stopwords);
return oldsize - this.words.size();
public Map<String, Word> words() {
// returns the words as word/indexWord relation map
// this is the internal map itself, not a copy: changes made by the caller affect this object
return this.words;
public List<String> synonyms() {
// returns the collected synonyms as a newly allocated list (a copy of the internal set),
// in the iteration order of that set
ArrayList<String> l = new ArrayList<String>(this.synonyms.size());
for (String s: this.synonyms) l.add(s);
return l;
public long fuzzySignature() {
// returns the stored fuzzy_signature field, a long used for duplicate detection
return this.fuzzy_signature;
public String fuzzySignatureText() {
// returns the stored fuzzy_signature_text field
return this.fuzzy_signature_text;
public long exactSignature() {
// returns the stored exact_signature field, a long used for duplicate detection
return this.exact_signature;
public String language() {
// returns whatever the language identificator reports for the words fed to it so far
// NOTE(review): the text-only constructor leaves languageIdentificator null, so this call
// would throw a NullPointerException on such an instance - confirm it is never used that way
return this.languageIdentificator.getLanguage();
// Walks over all tokens of the given text and fills this.words (and, with autotagging, this.tags).
// For every token it optionally looks up autotagging terms built from the current word and the
// preceding cached words, detects the "index of ... last modified" pattern of directory listings,
// and creates a Word entry for each word that is not yet known. If pseudostemming is enabled,
// words are afterwards compared against shortened versions of themselves. Finally the
// RESULT_* counters are stored.
//   text          - the text to condense; must not be null (asserted only)
//   meaningLib    - handed through to the WordTokenizer
//   doAutotagging - request autotagging; forced to false when the autotagging library is empty
private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
assert text != null;
final Set<String> currsentwords = new HashSet<String>();
String word = "";
// sliding window of the most recent words, used to build multi-word terms for autotagging
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
String k;
Tagging.Metatag tag;
int wordlen;
Word wsp;
// NOTE(review): declared final but no assignment is visible in this excerpt before its use in the
// pseudostemming section below - confirm against the complete source
final Word wsp1;
int wordHandle;
int wordHandleCount = 0;
// constant zero; stored as RESULT_DIFF_SENTENCES at the end
final int sentenceHandleCount = 0;
int allwordcounter = 0;
// constant zero; stored as RESULT_NUMB_SENTENCES at the end
final int allsentencecounter = 0;
int wordInSentenceCounter = 1;
// state for the detection of directory listings ("index of" followed later by "last modified")
boolean comb_indexof = false, last_last = false, last_index = false;
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
// read source
final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
try {
while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
// the identificator is null for instances created by the text-only constructor
if (this.languageIdentificator != null) this.languageIdentificator.add(word);
if (word.length() < wordminsize) continue;
// get tags from autotagging
if (doAutotagging) {
for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
// wordc is number of words that are tested
StringBuilder sb = new StringBuilder();
if (wordc == 1) {
} else {
// prepend the last (wordc - 1) cached words, each followed by a blank
for (int w = 0; w < wordc - 1; w++) {
sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
String testterm = sb.toString().trim();
//System.out.println("Testing: " + testterm);
tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
if (tag != null) {
// tags are grouped by the name of the vocabulary they come from
String navigatorName = tag.getVocabularyName();
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
if (tagset == null) {
tagset = new HashSet<Tagging.Metatag>();
this.tags.put(navigatorName, tagset);
// shift wordcache
// (move all entries one slot to the left and store the current word in the last slot)
System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
wordcache[wordcache.length - 1] = word;
// distinguish punctuation and words
wordlen = word.length();
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
// store sentence
wordInSentenceCounter = 1;
} else {
// check index.of detection
if (last_last && comb_indexof && word.equals("modified")) {
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
// store word
wsp = this.words.get(word);
if (wsp != null) {
// word already exists
wordHandle = wsp.posInText;;
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word, wsp);
// we now have the unique handle of the word, put it into the sentence:
} finally {
if (pseudostemming) {
Map.Entry<String, Word> entry;
// we search for similar words and reorganize the corresponding sentences
// a word is similar, if a shortened version is equal
// NOTE(review): the trailing comment on the next line says "descending", but a TreeMap with natural
// ordering (as created in the text-only constructor) iterates its entries in ascending key order - confirm
final Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order
wordsearch: while (wi.hasNext()) {
entry =;
word = entry.getKey();
wordlen = word.length();
wsp = entry.getValue();
// try cutting off i trailing characters, from wordcut characters down to one
for (int i = wordcut; i > 0; i--) {
if (wordlen > i) {
k = word.substring(0, wordlen - i);
if (this.words.containsKey(k)) {
// update word counter
wsp1.count = wsp1.count + wsp.count;
this.words.put(k, wsp1);
// remove current word
continue wordsearch;
// store result
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
// NOTE(review): no increment of allwordcounter is visible in this excerpt - confirm
this.RESULT_NUMB_WORDS = allwordcounter;
this.RESULT_DIFF_WORDS = wordHandleCount;
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map
// a null text yields null; otherwise a temporary Condenser is created with autotagging
// switched off and its word map is handed out
if (text == null) return null;
return new Condenser(text, meaningLib, false).words();
public static void main(final String[] args) {
// read a property file and convert them into configuration lines
// args[0] is the path of the property file; the keys keywords0 .. keywords15 are looked up
// and each value is split at commas
try {
final File f = new File(args[0]);
final Properties p = new Properties();
// NOTE(review): the FileInputStream created here is never closed in the visible code
p.load(new FileInputStream(f));
final StringBuilder sb = new StringBuilder();
for (int i = 0; i <= 15; i++) {
// NOTE(review): getProperty returns null for a missing key, which would make split() below throw
final String s = p.getProperty("keywords" + i);
final String[] l = s.split(",");
for (final String element : l) {
// entries of different keys are separated by ",\n"; nothing is appended after the last key
if (i < 15) sb.append(",\n");
} catch (final FileNotFoundException e) {
} catch (final IOException e) {

@ -722,7 +722,7 @@ dc_rights
final String language = dc_language();
if (language != null && language.length() > 0) os.write("<dc:language>" + dc_language() + "</dc:language>\n");
os.write("<dc:date>" + ISO8601Formatter.FORMATTER.format(date) + "</dc:date>\n");
if (this.lon != 0.0f && != 0.0f) os.write("<geo:Point><geo:long>" + this.lon +"</geo:long><geo:lat>" + + "</geo:lat></geo:Point>\n");
if (this.lon != 0.0 && != 0.0) os.write("<geo:Point><geo:long>" + this.lon +"</geo:long><geo:lat>" + + "</geo:lat></geo:Point>\n");
@ -821,7 +821,7 @@ dc_rights
ContentScraper.addAllImages(images, doc.getImages());
if (doc.lon() != 0.0f && != 0.0f) { lon = doc.lon(); lat =; }
if (doc.lon() != 0.0 && != 0.0) { lon = doc.lon(); lat =; }
// clean up parser data

@ -226,7 +226,7 @@ public class URIMetadataRow {
if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher);
if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF();
if (lon == 0.0 && lat == 0.0) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF();
String s0 = s.toString();
return UTF8.getBytes(s0);
@ -514,7 +514,11 @@ public class URIMetadataRow {
if (p < 0) {
return 0.0d;
return this.latlon.charAt(0) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(0, p));
try {
return this.latlon.charAt(0) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(0, p));
} catch (NumberFormatException e) {
return 0.0d;
public double lon() {
if (this.latlon == null || this.latlon.isEmpty()) return 0.0d;
@ -522,7 +526,11 @@ public class URIMetadataRow {
if (p < 0) {
return 0.0d;
return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1));
try {
return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1));
} catch (NumberFormatException e) {
return 0.0d;

@ -33,6 +33,11 @@ import;
import java.util.Iterator;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
public class migration {
//SVN constants
@ -256,4 +261,82 @@ public class migration {
sb.setConfig("crawler.http.acceptCharset", sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
/**
 * Converts the old urldb to Solr, in chunks of 1000 entries.
 * Creates a lock file in the work directory so that only one migration thread is active at a time.
 * @return current size of the urldb index
 */
public static int migrateUrldbtoSolr(final Switchboard sb) {
// Moves the entries of the old urldb into the new index using a background thread.
// Returns the size of the urldb at the time of the call, or 0 when no urldb is connected.
int ret = 0;
final File f;
final Fulltext ft = sb.index.fulltext();
if (ft.getURLDb() != null) {
ret = ft.getURLDb().size();
// the lock file marks a migration that is already in progress
f = new File(sb.workPath, "migrateUrldbtoSolr.lck");
if (f.exists()) {
// another migration holds the lock: only report the current size
return ret;
} else {
// NOTE(review): the statement that creates the lock file is not visible in this excerpt;
// checking exists() and creating the file afterwards are separate steps - confirm this is acceptable
try {
} catch (IOException ex) {
Log.logInfo("migrateUrldbtoSolr","could not create lock file");
final Thread t = new Thread() {
// loop control flag, cleared by exit(), on an empty chunk, or on termination request
// NOTE(review): plain (non-volatile) field; if exit() is meant to be called from another thread,
// visibility of the change is not guaranteed - confirm
boolean go = true;
final Index urldb = ft.getURLDb();
public void run() {
try {
int i = urldb.size();
while (go && i > 0) {
// fetch up to 1000 entries; a null or empty chunk ends the migration
List<Row.Entry> chunk = urldb.random(1000);
if ((chunk == null) || (chunk.size() == 0)) {
go = false;
Iterator<Row.Entry> chunkit = chunk.iterator();
while (go && chunkit.hasNext()) {
try { // to catch any data errors
URIMetadataRow row = new URIMetadataRow(, null);
ft.putMetadata(row); // this deletes old urldb-entry first and inserts into Solr
// stop early when the application is about to terminate
if (Switchboard.getSwitchboard().shallTerminate()) {
go = false;
} catch (Exception e) {
// a broken entry must not abort the run; note that the exception itself is not logged
Log.logInfo("migrateUrldbtoSolr", "some error while adding old data to new index, continue with next entry");
Log.logInfo("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)");
} catch (IOException ex) {
Log.logInfo("migrateUrldbtoSolr", "error reading old urldb index");
} finally {
// always release the lock, regardless of how the loop ended
if (f.exists()) {
f.delete(); // delete lock file
public void exit() {
// asks the migration loops to stop at their next condition check
go = false;
return ret;

@ -100,6 +100,28 @@ public final class Fulltext implements Iterable<byte[]> {
this.forcedCommitTime = 0;
/**
 * Returns the connected URLDb.
 * @deprecated used only for migration
 * @return the connected URLDb
 */
public Index getURLDb() {
// hands out the urlIndexFile field itself (no copy); it is null when no URLDb is connected
return this.urlIndexFile;
/**
 * True if the old metadata index URLDb is connected.
 * Used only for migration.
 * @deprecated current and future versions use Solr for metadata
 */
public boolean connectedURLDb() {
// connected means: the urlIndexFile field has been set (is not null)
return this.urlIndexFile != null;
protected void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
if (this.urlIndexFile != null) return;
this.tablename = tablename;

@ -274,7 +274,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, "UTF8");
// coordinates
if ( != 0.0f && md.lon() != 0.0f) {
if ( != 0.0 && md.lon() != 0.0) {
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString( + "," + Double.toString(md.lon()));
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, 200);
@ -794,7 +794,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, document.getCharset());
// coordinates
if ( != 0.0f && document.lon() != 0.0f) {
if ( != 0.0 && document.lon() != 0.0) {
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString( + "," + Double.toString(document.lon()));
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode());

@ -836,7 +836,7 @@ public final class SearchEvent {
// check location constraint
if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && ( == 0.0f || page.lon() == 0.0f)) {
if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && ( == 0.0 || page.lon() == 0.0)) {
