added (manual) urldb migration (link on: Index Administration -> Federated Solr Index)

- migrates all entries in old urldb

Metadata coordinate (lat / lon) NumberFormatException still occurs relatively often (see excerpt below):
- added try/catch for URIMetadataRow (seems not to be needed in URIMetaDataNode, as Solr internally checks for number format)
- removed possible type conversion for lat() / lon() comparison with 0.0f, changed to 0.0  (leaving it to the compiler/optimizer to choose number format)

current log excerpt for NumberFormatException:
W 2013/01/14 00:10:07 StackTrace For input string: "-"
java.lang.NumberFormatException: For input string: "-"
	at sun.misc.FloatingDecimal.readJavaFormatString(Unknown Source)
	at java.lang.Double.parseDouble(Unknown Source)
	at net.yacy.kelondro.data.meta.URIMetadataRow$Components.lon(URIMetadataRow.java:525)
	at net.yacy.kelondro.data.meta.URIMetadataRow.lon(URIMetadataRow.java:279)
	at net.yacy.search.index.SolrConfiguration.metadata2solr(SolrConfiguration.java:277)
	at net.yacy.search.index.Fulltext.putMetadata(Fulltext.java:329)
	at transferURL.respond(transferURL.java:152)
...
Caused by: java.lang.NumberFormatException: For input string: "-"
	at sun.misc.FloatingDecimal.readJavaFormatString(Unknown Source)
	at java.lang.Double.parseDouble(Unknown Source)
	at net.yacy.kelondro.data.meta.URIMetadataRow$Components.lon(URIMetadataRow.java:525)
	at net.yacy.kelondro.data.meta.URIMetadataRow.lon(URIMetadataRow.java:279)
	at net.yacy.search.index.SolrConfiguration.metadata2solr(SolrConfiguration.java:277)
	at net.yacy.search.index.Fulltext.putMetadata(Fulltext.java:329)
	at transferURL.respond(transferURL.java:152)
pull/1/head
reger 12 years ago
parent 3b6e08b49f
commit 3897bb4409

@ -26,7 +26,8 @@
This is a switchboard for the usage of embedded metadata to embedded solr.
The rwi index is necessary for index transmission and shall be switched off in future portalmode configurations.
<dl>
<dt><input type="checkbox" name="core.service.fulltext" id="core.service.fulltext" #(core.service.fulltext.checked)#:: checked="checked"#(/core.service.fulltext.checked)# /></dt><dd>url metadata and embedded solr fulltext search index, interface at: <a href="/solr/select?q=*:*&start=0&rows=10" target="_blank">/solr/select?q=*:*&amp;start=0&amp;rows=10</a></dd>
<dt><input type="checkbox" name="core.service.fulltext" id="core.service.fulltext" #(core.service.fulltext.checked)#:: checked="checked"#(/core.service.fulltext.checked)# /></dt><dd>url metadata and embedded solr fulltext search index, interface at: <a href="/solr/select?q=*:*&start=0&rows=10" target="_blank">/solr/select?q=*:*&amp;start=0&amp;rows=10</a>
#(migrateUrlDbtoSolr)#:: <input type="button" class="submitready" onclick="window.location = '/api/migrateurldb_p.html';" value="migrate old index" />#(/migrateUrlDbtoSolr)# </dd>
<dt><input type="checkbox" name="core.service.rwi.tmp" id="core.service.rwi" #(core.service.rwi.tmp.checked)#:: checked="checked"#(/core.service.rwi.tmp.checked)# /></dt><dd>embedded 'classic' rwi index</dd>
<dt><input type="checkbox" name="core.service.citation.tmp" id="core.service.citation" #(core.service.citation.tmp.checked)#:: checked="checked"#(/core.service.citation.tmp.checked)# /></dt><dd>embedded citation reference index (link structure, used for ranking)</dd>
<dt></dt><dd><input type="submit" name="set" value="Set" /></dd>

@ -225,6 +225,10 @@ public class IndexFederated_p {
prop.put("solr.indexing.sharding", env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SHARDING, "modulo-host-md5"));
prop.put("solr.indexing.schemefile", schemename);
if ((sb.index.fulltext().connectedURLDb())) {
prop.put("migrateUrlDbtoSolr", 1);
} else prop.put("migrateUrlDbtoSolr", 0);
// return rewrite properties
return prop;
}

@ -0,0 +1,36 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Migrate URLdb</title>
#%env/templates/metas.template%#
</head>
<body>
#%env/templates/header.template%#
#%env/templates/simpleheader.template%#
<h2>Migrate URLdb to embedded Solr Index</h2>
<p>Convert old meta data (urldb) index to embedded Solr fulltext index.</p>
<dl>
<dd>
<p>A low priority background job has been started which reads the old index, adds it to Solr and deletes the entry from the old index.</p>
<p>The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).<br />
If you feel that the not accessed entries are still relevant, with this migration all entries from the old urldb index will be migrated.</p>
<p>You may refresh this page to see how many entries in the old index are left for migration</p>
<p>Hint: this background task runs until all entries are migrated or YaCy is shutdown. The migration is not automatically restarted.</p>
</dd>
</dl>
<form action="migrateurldb_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<input type="hidden" name="lastcount" value="#[lastcount]#" />
<input type="hidden" name="lasttime" value="#[lasttime]#" />
<p><b>#[count]# entries</b> in old index left to migrate.</p>
<p>For large indexes this may run for a long time (migration speed: #[speed]# entries per minute) <input type="submit" name="dorefresh" value="refresh" /></p>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@ -0,0 +1,44 @@
// migrateurldb_p.java
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.migration;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
 * Servlet behind migrateurldb_p.html.
 * Every request calls {@link migration#migrateUrldbtoSolr(Switchboard)} (which starts the
 * background migration unless its lock file already exists) and reports how many entries
 * are left in the old urldb, together with an estimated migration speed.
 */
public class migrateurldb_p {

    /**
     * Builds the template properties for the migration status page.
     *
     * @param header the request header (not used)
     * @param post   the submitted form values, may be null; on a "dorefresh" submit the hidden
     *               fields "lastcount" and "lasttime" are used as reference point for the speed
     * @param env    the server environment, must be the {@link Switchboard}
     * @return properties "count", "speed" and, while entries are left, "lastcount" and "lasttime"
     */
    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        final int cnt = migration.migrateUrldbtoSolr(sb);
        if (cnt > 0) {
            prop.put("count", cnt);

            if (post != null && post.containsKey("dorefresh")) {
                // reference point handed through by the previous page view
                final int lastcount = post.getInt("lastcount", 0);
                final long lasttime = post.getLong("lasttime", 1);
                final double minutes = (System.currentTimeMillis() - lasttime) / 60000.0d;

                if (minutes > 0.0d) {
                    prop.put("speed", (int) ((lastcount - cnt) / minutes));
                } else {
                    // no measurable time elapsed: dividing would yield Infinity/NaN,
                    // which the int cast turns into a bogus number
                    prop.put("speed", "?");
                }
                // keep the original reference point so the speed is averaged over the whole run
                prop.put("lasttime", lasttime);
                prop.put("lastcount", lastcount);
            } else {
                // first view: nothing to compare against yet, remember the reference point
                prop.put("speed", "?");
                prop.put("lastcount", cnt);
                prop.put("lasttime", System.currentTimeMillis());
            }
        } else {
            // a result of 0 means: no urldb connected or nothing left in it
            prop.put("speed", "");
            prop.put("count", "no urldb index available");
        }

        // return rewrite properties
        return prop;
    }
}

@ -106,7 +106,7 @@ public final class Condenser {
if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
this.languageIdentificator = new Identificator();

@ -722,7 +722,7 @@ dc_rights
final String language = dc_language();
if (language != null && language.length() > 0) os.write("<dc:language>" + dc_language() + "</dc:language>\n");
os.write("<dc:date>" + ISO8601Formatter.FORMATTER.format(date) + "</dc:date>\n");
if (this.lon != 0.0f && this.lat != 0.0f) os.write("<geo:Point><geo:long>" + this.lon +"</geo:long><geo:lat>" + this.lat + "</geo:lat></geo:Point>\n");
if (this.lon != 0.0 && this.lat != 0.0) os.write("<geo:Point><geo:long>" + this.lon +"</geo:long><geo:lat>" + this.lat + "</geo:lat></geo:Point>\n");
os.write("</record>\n");
}
@ -821,7 +821,7 @@ dc_rights
anchors.putAll(doc.getAnchors());
rss.putAll(doc.getRSS());
ContentScraper.addAllImages(images, doc.getImages());
if (doc.lon() != 0.0f && doc.lat() != 0.0f) { lon = doc.lon(); lat = doc.lat(); }
if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
}
// clean up parser data

@ -226,7 +226,7 @@ public class URIMetadataRow {
s.appendLF();
if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher);
s.appendLF();
if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF();
if (lon == 0.0 && lat == 0.0) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF();
String s0 = s.toString();
s.close();
return UTF8.getBytes(s0);
@ -514,7 +514,11 @@ public class URIMetadataRow {
if (p < 0) {
return 0.0d;
}
try {
return this.latlon.charAt(0) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(0, p));
} catch (NumberFormatException e) {
return 0.0d;
}
}
public double lon() {
if (this.latlon == null || this.latlon.isEmpty()) return 0.0d;
@ -522,7 +526,11 @@ public class URIMetadataRow {
if (p < 0) {
return 0.0d;
}
try {
return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1));
} catch (NumberFormatException e) {
return 0.0d;
}
}
}

@ -33,6 +33,11 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import com.google.common.io.Files;
import java.util.Iterator;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.search.index.Fulltext;
public class migration {
//SVN constants
@ -256,4 +261,82 @@ public class migration {
sb.setConfig("crawler.http.acceptCharset", sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
}
}
/**
 * Converts the old urldb to Solr in a low priority background thread,
 * in chunks of 1000 entries.
 * A lock file in the work directory allows only one active migration thread:
 * if the lock file already exists, no new thread is started.
 * The thread runs until the urldb is empty, no more entries can be read
 * or the switchboard signals termination; it then removes the lock file.
 *
 * @param sb the switchboard providing the fulltext index and the work directory
 * @return size of the urldb index at the time of the call, 0 if no urldb is connected
 */
@SuppressWarnings("deprecation")
public static int migrateUrldbtoSolr(final Switchboard sb) {
    final Fulltext ft = sb.index.fulltext();
    final Index urldb = ft.getURLDb();
    if (urldb == null) {
        return 0;
    }
    final int ret = urldb.size();

    final File f = new File(sb.workPath, "migrateUrldbtoSolr.lck");
    // registered before the lock check on purpose: a lock file that already exists
    // is then still removed at the next regular JVM exit
    f.deleteOnExit();
    try {
        // createNewFile() checks and creates atomically, so two concurrent requests
        // can not both pass an exists() check and start two migration threads
        if (!f.createNewFile()) {
            return ret; // lock file already present
        }
    } catch (final IOException ex) {
        // best effort as before: the migration is started even without a lock file
        Log.logInfo("migrateUrldbtoSolr", "could not create lock file: " + ex.getMessage());
    }

    final Thread t = new Thread() {
        boolean go = true;

        @Override
        public void run() {
            try {
                Thread.currentThread().setName("migration.migrateUrldbtoSolr");
                int i = urldb.size();
                while (go && i > 0) {
                    final List<Row.Entry> chunk = urldb.random(1000);
                    if (chunk == null || chunk.isEmpty()) {
                        go = false;
                        break;
                    }
                    final Iterator<Row.Entry> chunkit = chunk.iterator();
                    while (go && chunkit.hasNext()) {
                        try { // to catch any data errors
                            final URIMetadataRow row = new URIMetadataRow(chunkit.next(), null);
                            ft.putMetadata(row); // this deletes old urldb-entry first and inserts into Solr
                            i--;
                        } catch (final Exception e) {
                            // NOTE(review): a failed entry is not counted down; if it also stays in
                            // the urldb it may be picked again by a later chunk - confirm
                            Log.logInfo("migrateUrldbtoSolr", "some error while adding old data to new index, continue with next entry: " + e);
                        }
                        // checked after every entry, also after a failed one
                        if (sb.shallTerminate()) {
                            go = false;
                        }
                    }
                    Log.logInfo("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)");
                }
                ft.commit();
            } catch (final IOException ex) {
                Log.logInfo("migrateUrldbtoSolr", "error reading old urldb index: " + ex.getMessage());
            } finally {
                if (f.exists()) {
                    f.delete(); // delete lock file
                }
            }
        }

        /** Asks the migration loop to stop after the entry currently processed. */
        public void exit() {
            go = false;
        }
    };
    t.setPriority(Thread.MIN_PRIORITY);
    t.start();
    return ret;
}
}

@ -100,6 +100,28 @@ public final class Fulltext implements Iterable<byte[]> {
this.forcedCommitTime = 0;
}
/**
 * Gives access to the old url metadata index file.
 *
 * @return the value of the urlIndexFile field, the connected URLDb
 * @deprecated used only for migration
 */
@Deprecated
public Index getURLDb() {
    final Index legacyUrlDb = this.urlIndexFile;
    return legacyUrlDb;
}
/**
 * Tells whether the old metadata index URLDb is connected.
 * Used only for migration.
 *
 * @return true if the urlIndexFile field is set, false if it is null
 * @deprecated current and future versions use Solr for metadata
 */
@Deprecated
public boolean connectedURLDb() {
    if (this.urlIndexFile == null) {
        return false;
    }
    return true;
}
protected void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
if (this.urlIndexFile != null) return;
this.tablename = tablename;

@ -274,7 +274,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, "UTF8");
// coordinates
if (md.lat() != 0.0f && md.lon() != 0.0f) {
if (md.lat() != 0.0 && md.lon() != 0.0) {
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(md.lat()) + "," + Double.toString(md.lon()));
}
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, 200);
@ -794,7 +794,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, document.getCharset());
// coordinates
if (document.lat() != 0.0f && document.lon() != 0.0f) {
if (document.lat() != 0.0 && document.lon() != 0.0) {
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon()));
}
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode());

@ -836,7 +836,7 @@ public final class SearchEvent {
}
// check location constraint
if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && (page.lat() == 0.0f || page.lon() == 0.0f)) {
if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && (page.lat() == 0.0 || page.lon() == 0.0)) {
this.query.misses.add(page.hash());
continue;
}

Loading…
Cancel
Save