- removed old metadata database and all migration code

- refactored all code which used URIMetadataRow as the standard for word
hash length and word hash ordering and moved those constants to the class 'Word',
because the class URIMetadataRow defines the old metadata data structure
and should become superfluous in the future
- removed unused methods from URIMetadataRow as preparation for further
removal of that class
pull/1/head
Michael Peter Christen 11 years ago
parent d3de309953
commit 1ea17bd9f3

@ -45,7 +45,6 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.ListManager;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
@ -110,8 +109,8 @@ public class IndexControlRWIs_p {
final String[] urls = post.getAll("urlhx.*");
HandleSet urlb =
new RowHandleSet(
URIMetadataRow.rowdef.primaryKeyLength,
URIMetadataRow.rowdef.objectOrder,
Word.commonHashLength,
Word.commonHashOrder,
urls.length);
if ( urls != null ) {
for ( final String s : urls ) {
@ -165,8 +164,8 @@ public class IndexControlRWIs_p {
final Iterator<WordReference> en = index.entries();
urlb =
new RowHandleSet(
URIMetadataRow.rowdef.primaryKeyLength,
URIMetadataRow.rowdef.objectOrder,
Word.commonHashLength,
Word.commonHashOrder,
index.size());
while ( en.hasNext() ) {
try {
@ -208,8 +207,8 @@ public class IndexControlRWIs_p {
}
final HandleSet urlHashes =
new RowHandleSet(
URIMetadataRow.rowdef.primaryKeyLength,
URIMetadataRow.rowdef.objectOrder,
Word.commonHashLength,
Word.commonHashOrder,
0);
for ( final byte[] b : urlb ) {
try {
@ -363,8 +362,8 @@ public class IndexControlRWIs_p {
final String blacklist = post.get("blacklist", "");
final HandleSet urlHashes =
new RowHandleSet(
URIMetadataRow.rowdef.primaryKeyLength,
URIMetadataRow.rowdef.objectOrder,
Word.commonHashLength,
Word.commonHashOrder,
urlb.size());
if ( post.containsKey("blacklisturls") ) {
final String[] supportedBlacklistTypes =

@ -149,7 +149,7 @@ public class IndexControlURLs_p {
// delete everything
if ( post.containsKey("deletecomplete") ) {
if ( post.get("deleteIndex", "").equals("on") ) {
try {segment.fulltext().clearURLIndex();} catch (final IOException e) {}
segment.fulltext().clearURLIndex();
try {segment.fulltext().clearLocalSolr();} catch (final IOException e) {}
}
if ( post.get("deleteRemoteSolr", "").equals("on")) {

@ -87,11 +87,9 @@ public class IndexFederated_p {
if (previous_core_fulltext && !post_core_fulltext) {
// switch off
sb.index.fulltext().disconnectLocalSolr();
sb.index.fulltext().disconnectUrlDb();
}
if (!previous_core_fulltext && post_core_fulltext) {
// switch on
sb.index.connectUrlDb(sb.useTailCache, sb.exceed134217727);
try { sb.index.fulltext().connectLocalSolr(); } catch (final IOException e) { ConcurrentLog.logException(e); }
}

@ -36,7 +36,6 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;

@ -1,36 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Migrate URLdb</title>
#%env/templates/metas.template%#
</head>
<body>
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<h2>Migrate URLdb to embedded Solr Index</h2>
<p>Convert old meta data (urldb) index to embedded Solr fulltext index.</p>
<dl>
<dd>
<p>A low priority background job has been started which reads the old index, adds it to Solr and deletes the entry from the old index.</p>
<p>The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).<br />
If you feel that entries which have not been accessed are still relevant, this migration will migrate all entries from the old urldb index.</p>
<p>You may refresh this page to see how many entries in the old index are left for migration</p>
<p>Hint: this background task runs until all entries are migrated or YaCy is shut down. The migration is not restarted automatically.</p>
</dd>
</dl>
<form action="migrateurldb_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<input type="hidden" name="lastcount" value="#[lastcount]#" />
<input type="hidden" name="lasttime" value="#[lasttime]#" />
<p><b>#[count]# entries</b> in old index left to migrate.</p>
<p>For large indexes this may run for a long time (migration speed: #[speed]# entries per minute) <input type="submit" name="dorefresh" value="refresh" /></p>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@ -1,44 +0,0 @@
// migrateurldb_p.java
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.migration;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
 * Servlet for migrateurldb_p.html: triggers (or observes) the background
 * migration of the old urldb index and reports its progress.
 */
public class migrateurldb_p {

    /**
     * Fills the template properties {@code count}, {@code speed},
     * {@code lastcount} and {@code lasttime}.
     *
     * @param header the request header (unused)
     * @param post   the submitted form values, may be {@code null}; on a
     *               "dorefresh" submit it carries the {@code lastcount} and
     *               {@code lasttime} values of the previous page
     * @param env    the server switch, expected to be the {@link Switchboard}
     * @return the properties used to render the page
     */
    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        final int cnt = migration.migrateUrldbtoSolr(sb);
        if (cnt > 0) {
            prop.put("count", cnt);
            if (post != null && post.containsKey("dorefresh")) {
                final int lastcount = post.getInt("lastcount", 0);
                final long t = post.getLong("lasttime", 1);
                // elapsed time since the reference measurement, in minutes
                final double difft = (System.currentTimeMillis() - t) / 60000.0d;
                if (difft > 0.0d) {
                    prop.put("speed", (int) ((lastcount - cnt) / difft));
                } else {
                    // lasttime is client supplied: a zero or negative interval would
                    // produce Integer.MAX_VALUE or a negative speed, so report "unknown"
                    prop.put("speed", "?");
                }
                // keep the original reference point so the speed is averaged over the whole run
                prop.put("lasttime", t);
                prop.put("lastcount", lastcount);
            } else {
                // first call: no reference measurement yet
                prop.put("speed", "?");
                prop.put("lastcount", cnt);
                prop.put("lasttime", System.currentTimeMillis());
            }
        } else {
            prop.put("speed", "");
            prop.put("count", "no urldb index available");
        }
        // return rewrite properties
        return prop;
    }
}

@ -42,7 +42,7 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Memory;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
@ -182,8 +182,8 @@ public final class transferRWI {
String wordHash;
byte[] urlHash;
WordReferenceRow iEntry;
final HandleSet unknownURL = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
final HandleSet knownURL = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
final HandleSet unknownURL = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
final HandleSet knownURL = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
final ArrayList<String> wordhashes = new ArrayList<String>();
int received = 0;
int blocked = 0;

@ -57,7 +57,6 @@ import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;

@ -51,7 +51,7 @@ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowHandleSet;
@ -96,7 +96,7 @@ public class Balancer {
this.cacheStacksPath = cachePath;
this.domainStacks = new ConcurrentHashMap<String, HostHandles>();
this.domStackInitSize = Integer.MAX_VALUE;
this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.double_push_check = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
this.zeroWaitingCandidates = new ArrayList<Map.Entry<String, byte[]>>();
this.random = new Random(System.currentTimeMillis());
@ -564,7 +564,7 @@ public class Balancer {
if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
this.domainStacks.clear();
this.lastDomainStackFill = System.currentTimeMillis();
final HandleSet blackhandles = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
final HandleSet blackhandles = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
String host;
Request request;
int count = 0;

@ -49,7 +49,6 @@ import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
@ -588,7 +587,7 @@ public final class CrawlSwitchboard {
r = sei.next();
String handle = r.profileHandle();
RowHandleSet us = this.profilesActiveCrawlsCounter.get(handle);
if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);}
if (us == null) {us = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);}
if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many
deletionCandidate.remove(handle);
if (deletionCandidate.size() == 0) return new HashSet<String>(0);

@ -32,7 +32,7 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowHandleSet;
@ -60,7 +60,7 @@ public class HostQueue {
final boolean exceed134217727) {
this.hostHash = hostHash;
this.queuesPath = queuesPath;
this.urlHashDoubleCheck = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.urlHashDoubleCheck = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
// create a stack for newly entered entries
if (!(this.queuesPath.exists())) this.queuesPath.mkdir(); // make the path

@ -43,7 +43,7 @@ import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
public class BookmarksDB {
@ -212,7 +212,7 @@ public class BookmarksDB {
final TreeSet<String> set=new TreeSet<String>(new bookmarkComparator(true));
final String tagHash=BookmarkHelper.tagHash(tagName);
final Tag tag=getTag(tagHash);
RowHandleSet hashes = tag == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes();
RowHandleSet hashes = tag == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10) : tag.getUrlHashes();
if (priv) {
for (byte[] hash: hashes) set.add(ASCII.String(hash));
} else {
@ -389,7 +389,7 @@ public class BookmarksDB {
private Tag(final String name) {
this.tagHash = BookmarkHelper.tagHash(name);
this.tagName = name;
this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
this.urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
}
/**

@ -11,16 +11,16 @@
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@ -31,7 +31,6 @@ import java.text.ParseException;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
@ -43,7 +42,6 @@ import net.yacy.cora.order.Digest;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
@ -59,7 +57,7 @@ public class URIMetadataRow {
// this object stores attributes for URL entries
public static final Row rowdef = new Row(
private static final Row rowdef = new Row(
"String hash-12, " + // the url's hash
"String comp-360, " + // components: the url, description, author, tags and publisher
"Cardinal mod-4 {b256}, " + // last-modified from the httpd
@ -108,7 +106,7 @@ public class URIMetadataRow {
private WordReference word; // this is only used if the url is transported via remote search requests
private Components comp;
public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) {
private URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) {
this.entry = entry;
this.snippet = "";
this.word = searchedWord;
@ -242,17 +240,6 @@ public class URIMetadataRow {
return h;
}
// lazily computed by hosthash(); null until the first call
private String hostHash = null;

/**
 * Returns the host hash of this entry as a String.
 * The value is taken from the entry's primary key bytes via
 * {@code ASCII.String(key, 6, 6)} (presumably offset 6, length 6 - confirm
 * against ASCII.String) and cached after the first call.
 *
 * @return the cached host hash
 */
public String hosthash() {
    if (this.hostHash == null) {
        this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6);
    }
    return this.hostHash;
}
/**
 * Tests the given pattern against this entry's metadata components.
 *
 * @param matcher the pattern to test with
 * @return the result of {@code Components.matches(matcher)}
 */
public boolean matches(final Pattern matcher) {
    final Components meta = this.metadata();
    return meta.matches(matcher);
}
/**
 * @return the URL held by this entry's metadata components
 */
public DigestURL url() {
    final Components meta = this.metadata();
    return meta.url();
}
@ -281,7 +268,7 @@ public class URIMetadataRow {
return this.metadata().lon();
}
private Components metadata() {
public Components metadata() {
// avoid double computation of metadata elements
if (this.comp != null) return this.comp;
// parse elements from comp field;
@ -434,20 +421,6 @@ public class URIMetadataRow {
}
}
/**
 * Builds a crawler {@link Request} from this metadata entry.
 * The request carries the initiator hash, the entry's URL, its referrer
 * hash, its title and its modification date; the remaining constructor
 * arguments are passed as {@code null} and zeros.
 *
 * @param initiatorHash hash of the initiating peer, converted to bytes with ASCII.getBytes
 * @return the new request
 */
public Request toBalancerEntry(final String initiatorHash) {
    final byte[] initiator = ASCII.getBytes(initiatorHash);
    return new Request(initiator, metadata().url(), referrerHash(), metadata().dc_title(), moddate(), null, 0, 0, 0, 0);
}
/**
* @return the object as String.<br>
* This e.g. looks like this:
@ -472,7 +445,7 @@ public class URIMetadataRow {
private final String dc_title, dc_creator, dc_subject, dc_publisher;
private String latlon; // a comma-separated tuple as "<latitude>,<longitude>" where the coordinates are given as WGS84 spatial coordinates in decimal degrees
public Components(
private Components(
final String urlRaw,
final byte[] urlhash,
final String title,
@ -489,12 +462,7 @@ public class URIMetadataRow {
this.dc_publisher = publisher;
this.latlon = latlon;
}
/**
 * Matches the lower-cased URL of these components against a pattern.
 * The raw URL string is preferred; if it is absent the normal form of the
 * parsed URL is used instead.
 *
 * @param matcher the pattern that must match the whole lower-cased URL
 * @return {@code true} on a full match, {@code false} if there is no match
 *         or neither URL representation is available
 */
public boolean matches(final Pattern matcher) {
    final String candidate;
    if (this.urlRaw != null) {
        candidate = this.urlRaw.toLowerCase();
    } else if (this.url != null) {
        candidate = this.url.toNormalform(true).toLowerCase();
    } else {
        return false;
    }
    return matcher.matcher(candidate).matches();
}
public DigestURL url() {
private DigestURL url() {
if (this.url == null) {
try {
this.url = new DigestURL(this.urlRaw, this.urlHash);
@ -506,11 +474,11 @@ public class URIMetadataRow {
}
return this.url;
}
public String dc_title() { return this.dc_title; }
public String dc_creator() { return this.dc_creator; }
public String dc_publisher() { return this.dc_publisher; }
public String dc_subject() { return this.dc_subject; }
public double lat() {
private String dc_title() { return this.dc_title; }
private String dc_creator() { return this.dc_creator; }
private String dc_publisher() { return this.dc_publisher; }
private String dc_subject() { return this.dc_subject; }
private double lat() {
if (this.latlon == null || this.latlon.isEmpty()) return 0.0d;
final int p = this.latlon.indexOf(',');
if (p < 0) return 0.0d;
@ -523,7 +491,7 @@ public class URIMetadataRow {
return 0.0d;
}
}
public double lon() {
private double lon() {
if (this.latlon == null || this.latlon.isEmpty()) return 0.0d;
final int p = this.latlon.indexOf(',');
if (p < 0) return 0.0d;

@ -40,10 +40,8 @@ import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MemoryControl;
public class Word {
/**
* this is the length (12) of the hash key that is used:<br>
* - for seed hashes (this Object)<br>
@ -51,7 +49,8 @@ public class Word {
* - for L-URL hashes (plasmaLURL.urlHashLength)<br><br>
* these hashes all shall be generated by base64.enhancedCoder
*/
public static final int commonHashLength = 12;
public static final int commonHashLength = 12;
public static final Base64Order commonHashOrder = Base64Order.enhancedCoder;
private static final int hashCacheSize = Math.max(20000, Math.min(200000, (int) (MemoryControl.available() / 40000L)));
private static ARC<String, byte[]> hashCache = null;
@ -64,12 +63,6 @@ public class Word {
ConcurrentLog.info("Word", "hashCache.size = " + 1000);
}
}
/*
private static ConcurrentHashMap<String, byte[]> hashCache = null;
static {
hashCache = new ConcurrentHashMap<String, byte[]>();
}
*/
// object carries statistics for words and sentences
public int count; // number of occurrences
@ -122,7 +115,7 @@ public class Word {
byte[] h = hashCache.get(wordlc);
if (h != null) return h;
// calculate the hash
h = Base64Order.enhancedCoder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength);
h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength);
while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) {
// ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer
// statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never)

@ -40,7 +40,7 @@ import net.yacy.cora.storage.ComparableARC;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.MergeIterator;
@ -102,7 +102,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
this.targetFileSize = targetFileSize;
this.maxFileSize = maxFileSize;
this.writeBufferSize = writeBufferSize;
this.removeDelayedURLs = new TreeMap<byte[], HandleSet>(URIMetadataRow.rowdef.objectOrder);
this.removeDelayedURLs = new TreeMap<byte[], HandleSet>(Word.commonHashOrder);
this.flushShallRun = true;
this.flushThread = new FlushThread();
this.flushThread.start();
@ -399,7 +399,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
r = this.removeDelayedURLs.get(termHash);
}
if (r == null) {
r = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
r = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
}
try {
r.put(urlHashBytes);
@ -414,7 +414,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
@Override
public void removeDelayed() throws IOException {
final HandleSet words = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed.
final HandleSet words = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed.
synchronized (this.removeDelayedURLs) {
for (final byte[] b: this.removeDelayedURLs.keySet()) try {words.put(b);} catch (final SpaceExceededException e) {}
}
@ -476,7 +476,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
}
public RemoveReducer(final byte[] urlHashBytes) {
this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
try {
this.urlHashes.put(urlHashBytes);
} catch (final SpaceExceededException e) {

@ -43,11 +43,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.storage.Configuration.Entry;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.client.solrj.SolrServerException;
@ -282,83 +278,6 @@ public class migration {
sb.setConfig("crawler.http.acceptCharset", sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
}
}
/**
 * Converts the old urldb to Solr in chunks of 1000 entries, using a
 * minimum-priority background thread.
 * A lock file in the work directory allows only one active migration thread;
 * if the lock file already exists no new thread is started.
 *
 * @param sb the switchboard that provides the index and the work path
 * @return current size of the urldb index, or 0 if no urldb is connected
 */
@SuppressWarnings("deprecation")
public static int migrateUrldbtoSolr(final Switchboard sb) {
    final Fulltext ft = sb.index.fulltext();
    if (ft.getURLDb() == null) {
        return 0;
    }
    final int ret = ft.getURLDb().size();
    final File f = new File(sb.workPath, "migrateUrldbtoSolr.lck");
    // registered before the ownership check (as before), so a lock file that
    // is found here is also removed when this JVM exits
    f.deleteOnExit();
    try {
        // createNewFile() checks and creates atomically; a separate exists()
        // test would let two callers both start a migration thread
        if (!f.createNewFile()) {
            return ret;
        }
    } catch (final IOException ex) {
        // best effort: the migration is started even without a lock file
        ConcurrentLog.info("migrateUrldbtoSolr","could not create lock file");
    }

    final Thread t = new Thread() {
        // volatile: exit() may be called from a different thread than run()
        volatile boolean go = true;
        final Index urldb = ft.getURLDb();

        @Override
        public void run() {
            try {
                Thread.currentThread().setName("migration.migrateUrldbtoSolr");
                int i = urldb.size();
                while (go && i > 0) {
                    List<Row.Entry> chunk = urldb.random(1000);
                    if ((chunk == null) || (chunk.size() == 0)) {
                        go = false;
                        break;
                    }
                    Iterator<Row.Entry> chunkit = chunk.iterator();
                    while (go && chunkit.hasNext()) {
                        try { // to catch any data errors
                            URIMetadataRow row = new URIMetadataRow(chunkit.next(), null);
                            ft.putMetadata(row); // this deletes old urldb-entry first and inserts into Solr
                            i--;
                        } catch (final Exception e) {
                            ConcurrentLog.info("migrateUrldbtoSolr", "some error while adding old data to new index, continue with next entry");
                        }
                        // checked outside the try block so that a failing entry
                        // cannot skip the shutdown test and keep the loop alive
                        if (Switchboard.getSwitchboard().shallTerminate()) {
                            go = false;
                        }
                    }
                    ConcurrentLog.info("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)");
                }
                ft.commit(true);
            } catch (final IOException ex) {
                ConcurrentLog.info("migrateUrldbtoSolr", "error reading old urldb index");
            } finally {
                if (f.exists()) {
                    f.delete(); // delete lock file
                }
            }
        }

        public void exit() {
            go = false;
        }
    };
    t.setPriority(Thread.MIN_PRIORITY);
    t.start();
    return ret;
}
/**
* Reindex embedded solr index

@ -39,7 +39,6 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Memory;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.index.RowHandleSet;
@ -194,7 +193,7 @@ public class Dispatcher {
final ArrayList<ReferenceContainer<WordReference>> rc;
if (ram) {
// selection was only from ram, so we have to carefully remove only the selected entries
final HandleSet urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
Iterator<WordReference> it;
for (final ReferenceContainer<WordReference> c: containers) {
urlHashes.clear();

@ -52,6 +52,7 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.ListManager;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.SetTools;
@ -462,7 +463,7 @@ public class Blacklist {
}
HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType);
if (urlHashCache == null) {
urlHashCache = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
urlHashCache = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
if (isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
try {
urlHashCache.put(url.hash());
@ -679,13 +680,13 @@ public class Blacklist {
try {
ObjectInputStream in = new ObjectInputStream(new FileInputStream(cachefile));
RowHandleSet rhs = (RowHandleSet) in.readObject();
this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0) : rhs);
this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0) : rhs);
in.close();
return;
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}
}
this.cachedUrlHashs.put(type, new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0));
this.cachedUrlHashs.put(type, new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0));
}
}

@ -506,7 +506,6 @@ public final class Switchboard extends serverSwitch {
this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
} catch (final IOException e) {ConcurrentLog.logException(e);}
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) {
this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
try {this.index.fulltext().connectLocalSolr();} catch (final IOException e) {ConcurrentLog.logException(e);}
}
this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
@ -1347,7 +1346,6 @@ public final class Switchboard extends serverSwitch {
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) {
this.index.fulltext().connectLocalSolr();
this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
}
this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));

@ -74,10 +74,6 @@ public class DocumentIndex extends Segment {
);
super.connectRWI(cachesize, targetFileSize * 4 - 1);
super.connectCitation(cachesize, targetFileSize * 4 - 1);
super.connectUrlDb(
false, // useTailCache
false // exceed134217727
);
super.fulltext().connectLocalSolr();
super.fulltext().setUseWebgraph(true);
this.callback = callback;

@ -54,7 +54,6 @@ import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.solr.instance.InstanceMirror;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.solr.instance.ShardInstance;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
@ -66,10 +65,6 @@ import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.Cache;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionConfiguration;
@ -90,9 +85,7 @@ public final class Fulltext {
// class objects
private final File segmentPath;
private final File archivePath;
private Index urlIndexFile;
private Export exportthread; // will have a export thread assigned if exporter is running
private String tablename;
private ArrayList<HostStat> statsDump;
private InstanceMirror solrInstances;
private final CollectionConfiguration collectionConfiguration;
@ -103,8 +96,6 @@ public final class Fulltext {
final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) {
this.segmentPath = segmentPath;
this.archivePath = archivePath;
this.tablename = null;
this.urlIndexFile = null;
this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null;
this.solrInstances = new InstanceMirror();
@ -121,35 +112,6 @@ public final class Fulltext {
return this.writeWebgraph;
}
/**
 * Gives access to the old URL metadata index.
 *
 * @return the connected URLDb, or {@code null} if none is connected
 * @deprecated used only for migration
 */
@Deprecated
public Index getURLDb() {
    return this.urlIndexFile;
}
/**
 * Opens the old URL metadata table unless one is already connected.
 * The table is a {@link SplitTable} inside the "default" sub-directory of
 * the segment path; an empty table is disconnected again immediately.
 *
 * @param tablename        name of the table; also stored in {@code this.tablename}
 * @param useTailCache     passed through to the SplitTable constructor
 * @param exceed134217727  passed through to the SplitTable constructor
 */
protected void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
    if (this.urlIndexFile == null) {
        this.tablename = tablename;
        final File location = new File(this.segmentPath, "default");
        this.urlIndexFile = new SplitTable(location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727);
        // SplitTable always returns != null, even if no file exists.
        // The old UrlDb should be null if it does not exist, so check and close it if empty.
        // TODO: check if a SplitTable.open() returning null or error status on not existing file is preferable
        if (this.urlIndexFile.isEmpty()) {
            disconnectUrlDb();
        }
    }
}
/**
 * Closes the old URL metadata index and drops the reference to it.
 * Does nothing when no index is connected.
 */
public void disconnectUrlDb() {
    if (this.urlIndexFile != null) {
        this.urlIndexFile.close();
        this.urlIndexFile = null;
    }
}
/**
 * @return the collection configuration this Fulltext instance was created with
 */
public CollectionConfiguration getDefaultConfiguration() {
    return this.collectionConfiguration;
}
@ -233,19 +195,13 @@ public final class Fulltext {
}
/**
 * Releases cached data: the cache of the old URL index (if that index is a
 * {@link Cache}), the host statistics dump and the caches of the Solr instances.
 */
public void clearCaches() {
    // instanceof is false for null, so no separate null test is needed
    if (this.urlIndexFile instanceof Cache) {
        ((Cache) this.urlIndexFile).clearCache();
    }
    if (this.statsDump != null) {
        this.statsDump.clear();
    }
    this.solrInstances.clearCaches();
    this.statsDump = null;
}
public void clearURLIndex() throws IOException {
public void clearURLIndex() {
if (this.exportthread != null) this.exportthread.interrupt();
if (this.urlIndexFile == null) {
SplitTable.delete(new File(this.segmentPath, "default"), this.tablename);
} else {
this.urlIndexFile.clear();
}
this.statsDump = null;
this.commit(true);
}
@ -280,8 +236,7 @@ public final class Fulltext {
public long collectionSize() {
long t = System.currentTimeMillis();
if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue;
long size = this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
size += this.solrInstances.getDefaultMirrorConnector().getSize();
long size = this.solrInstances.getDefaultMirrorConnector().getSize();
this.collectionSizeLastAccess = t;
this.collectionSizeLastValue = size;
return size;
@ -297,10 +252,6 @@ public final class Fulltext {
/**
 * Shuts this Fulltext instance down: forgets the statistics dump, closes
 * the old URL index if one is connected, then closes the Solr instances.
 */
public void close() {
    this.statsDump = null;
    final boolean legacyIndexOpen = this.urlIndexFile != null;
    if (legacyIndexOpen) {
        this.urlIndexFile.close();
        this.urlIndexFile = null;
    }
    this.solrInstances.close();
}
@ -364,27 +315,12 @@ public final class Fulltext {
try {
SolrDocument doc = this.getDefaultConnector().getDocumentById(u);
if (doc != null) {
if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); // migration
return new URIMetadataNode(doc, wre, weight);
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
// get the metadata from the old metadata index
if (this.urlIndexFile != null) try {
// slow migration to solr
final Row.Entry entry = this.urlIndexFile.remove(urlHash);
if (entry == null) return null;
URIMetadataRow row = new URIMetadataRow(entry, wre);
SolrInputDocument solrInput = this.collectionConfiguration.metadata2solr(row);
this.putDocument(solrInput);
SolrDocument sd = this.collectionConfiguration.toSolrDocument(solrInput);
return new URIMetadataNode(sd, wre, weight);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
return null;
}
@ -394,14 +330,8 @@ public final class Fulltext {
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
String url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
ConcurrentLog.info("Fulltext", "indexing: " + id + " " + url);
byte[] idb = ASCII.getBytes(id);
try {
if (this.urlIndexFile != null) this.urlIndexFile.remove(idb);
//Date sdDate = (Date) connector.getFieldById(id, CollectionSchema.last_modified.getSolrFieldName());
//Date docDate = null;
//if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) {
connector.add(doc);
//}
} catch (final SolrException e) {
throw new IOException(e.getMessage(), e);
}
@ -428,7 +358,6 @@ public final class Fulltext {
byte[] idb = entry.hash();
String id = ASCII.String(idb);
try {
if (this.urlIndexFile != null) this.urlIndexFile.remove(idb);
// because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten
SolrDocument sd = this.getDefaultConnector().getDocumentById(id);
if (sd == null || (new URIMetadataNode(sd)).isOlder(entry)) {
@ -458,24 +387,7 @@ public final class Fulltext {
(freshdate == null || freshdate.after(now)) ? null :
(WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
// delete in old metadata structure
if (Fulltext.this.urlIndexFile != null) {
final ArrayList<String> l = new ArrayList<String>();
CloneableIterator<byte[]> i;
try {
i = Fulltext.this.urlIndexFile.keys(true, null);
String hash;
while (i != null && i.hasNext()) {
hash = ASCII.String(i.next());
if (hosthashes.contains(hash.substring(6))) l.add(hash);
}
// then delete the urls using this list
for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
} catch (final IOException e) {}
}
// finally remove the line with statistics
// remove the line with statistics
if (Fulltext.this.statsDump != null) {
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
HostStat hs;
@ -578,12 +490,6 @@ public final class Fulltext {
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}
if (Fulltext.this.urlIndexFile != null) try {
for (String id: deleteIDs) {
final Row.Entry r = Fulltext.this.urlIndexFile.remove(ASCII.getBytes(id));
if (r != null) Fulltext.this.statsDump = null;
}
} catch (final IOException e) {}
}
public boolean remove(final byte[] urlHash) {
@ -595,20 +501,12 @@ public final class Fulltext {
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}
if (this.urlIndexFile != null) try {
final Row.Entry r = this.urlIndexFile.remove(urlHash);
if (r != null) this.statsDump = null;
return r != null;
} catch (final IOException e) {
return false;
}
return false;
}
@Deprecated
public boolean exists(final String urlHash) {
if (urlHash == null) return false;
if (this.urlIndexFile != null && this.urlIndexFile.has(ASCII.getBytes(urlHash))) return true;
try {
if (this.getDefaultConnector().existsById(urlHash)) return true;
} catch (final Throwable e) {
@ -629,17 +527,6 @@ public final class Fulltext {
if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e;
Set<String> idsC = new HashSet<String>();
idsC.addAll(ids);
if (this.urlIndexFile != null) {
Iterator<String> idsi = idsC.iterator();
String h;
while (idsi.hasNext()) {
h = idsi.next();
if (this.urlIndexFile.has(ASCII.getBytes(h))) {
idsi.remove();
e.add(h);
}
}
}
try {
Set<String> e1 = this.getDefaultConnector().existsByIds(idsC);
e.addAll(e1);

@ -66,7 +66,6 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
@ -204,10 +203,6 @@ public class Segment {
/**
 * Reports the segment count of the citation index.
 *
 * @return {@code 0} when {@code urlCitationIndex} is {@code null}, otherwise
 *         the value of {@code urlCitationIndex.getSegmentCount()}
 */
public long citationSegmentCount() {
    if (this.urlCitationIndex == null) {
        return 0;
    }
    return this.urlCitationIndex.getSegmentCount();
}
/**
 * Delegates to {@code fulltext.connectUrlDb}, passing {@code UrlDbName} as
 * the table name and forwarding both flags unchanged.
 *
 * @param useTailCache forwarded as-is to the fulltext instance
 *        (NOTE(review): presumably a cache option of the underlying table; confirm)
 * @param exceed134217727 forwarded as-is to the fulltext instance
 *        (NOTE(review): presumably permits exceeding a 134217727 size limit; confirm)
 */
public void connectUrlDb(final boolean useTailCache, final boolean exceed134217727) {
this.fulltext.connectUrlDb(UrlDbName, useTailCache, exceed134217727);
}
public Fulltext fulltext() {
return this.fulltext;
@ -280,7 +275,7 @@ public class Segment {
}
private static RowHandleSet getPossibleRootHashes(DigestURL url) {
RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
String rootStub = url.getProtocol() + "://" + url.getHost();
try {
rootCandidates.put(new DigestURL(rootStub).hash());

@ -70,7 +70,7 @@ import net.yacy.document.LargeNumberCache;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceVars;
@ -277,7 +277,7 @@ public final class SearchEvent {
this.addRunning = true;
this.receivedRemoteReferences = new AtomicInteger(0);
this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
this.urlhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
this.urlhashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 100);
this.taggingPredicates = new HashMap<String, String>();
for (Tagging t: LibraryProvider.autotagging.getVocabularies()) {
this.taggingPredicates.put(t.getName(), t.getPredicate());

Loading…
Cancel
Save