- removed the old metadata database and all migration code

- refactored all code that used URIMetadataRow as the standard for word
hash length and word hash ordering and moved those constants to the class 'Word',
because the class URIMetadataRow defined the old metadata data structure
and should become superfluous in the future
- removed unused methods from URIMetadataRow in preparation for further
removal of that class
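A minimal sketch of the call-site pattern this refactoring applies throughout the diff below; the helper class is hypothetical, only the Word constants and the three-argument RowHandleSet constructor are taken from the changed code:

    import net.yacy.cora.storage.HandleSet;
    import net.yacy.kelondro.data.word.Word;
    import net.yacy.kelondro.index.RowHandleSet;

    class UrlHashSetExample {                      // hypothetical helper, not part of the commit
        static HandleSet newUrlHashSet(final int expectedSize) {
            // before: new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength,
            //                          URIMetadataRow.rowdef.objectOrder, expectedSize);
            // after: the hash length and ordering are defined once in Word
            return new RowHandleSet(
                    Word.commonHashLength,   // 12-byte url/word hash key
                    Word.commonHashOrder,    // Base64Order.enhancedCoder
                    expectedSize);           // expected initial size
        }
    }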
Michael Peter Christen 11 years ago
parent d3de309953
commit 1ea17bd9f3

@@ -45,7 +45,6 @@ import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.data.ListManager;
 import net.yacy.document.Condenser;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.data.word.WordReferenceRow;
@@ -110,8 +109,8 @@ public class IndexControlRWIs_p {
         final String[] urls = post.getAll("urlhx.*");
         HandleSet urlb =
             new RowHandleSet(
-                URIMetadataRow.rowdef.primaryKeyLength,
-                URIMetadataRow.rowdef.objectOrder,
+                Word.commonHashLength,
+                Word.commonHashOrder,
                 urls.length);
         if ( urls != null ) {
             for ( final String s : urls ) {
@@ -165,8 +164,8 @@ public class IndexControlRWIs_p {
                 final Iterator<WordReference> en = index.entries();
                 urlb =
                     new RowHandleSet(
-                        URIMetadataRow.rowdef.primaryKeyLength,
-                        URIMetadataRow.rowdef.objectOrder,
+                        Word.commonHashLength,
+                        Word.commonHashOrder,
                         index.size());
                 while ( en.hasNext() ) {
                     try {
@@ -208,8 +207,8 @@ public class IndexControlRWIs_p {
                 }
                 final HandleSet urlHashes =
                     new RowHandleSet(
-                        URIMetadataRow.rowdef.primaryKeyLength,
-                        URIMetadataRow.rowdef.objectOrder,
+                        Word.commonHashLength,
+                        Word.commonHashOrder,
                         0);
                 for ( final byte[] b : urlb ) {
                     try {
@@ -363,8 +362,8 @@ public class IndexControlRWIs_p {
             final String blacklist = post.get("blacklist", "");
             final HandleSet urlHashes =
                 new RowHandleSet(
-                    URIMetadataRow.rowdef.primaryKeyLength,
-                    URIMetadataRow.rowdef.objectOrder,
+                    Word.commonHashLength,
+                    Word.commonHashOrder,
                     urlb.size());
             if ( post.containsKey("blacklisturls") ) {
                 final String[] supportedBlacklistTypes =

@@ -149,7 +149,7 @@ public class IndexControlURLs_p {
         // delete everything
         if ( post.containsKey("deletecomplete") ) {
             if ( post.get("deleteIndex", "").equals("on") ) {
-                try {segment.fulltext().clearURLIndex();} catch (final IOException e) {}
+                segment.fulltext().clearURLIndex();
                 try {segment.fulltext().clearLocalSolr();} catch (final IOException e) {}
             }
             if ( post.get("deleteRemoteSolr", "").equals("on")) {

@@ -87,11 +87,9 @@ public class IndexFederated_p {
         if (previous_core_fulltext && !post_core_fulltext) {
             // switch off
             sb.index.fulltext().disconnectLocalSolr();
-            sb.index.fulltext().disconnectUrlDb();
         }
         if (!previous_core_fulltext && post_core_fulltext) {
             // switch on
-            sb.index.connectUrlDb(sb.useTailCache, sb.exceed134217727);
             try { sb.index.fulltext().connectLocalSolr(); } catch (final IOException e) { ConcurrentLog.logException(e); }
         }

@@ -36,7 +36,6 @@ import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 import net.yacy.cora.document.id.MultiProtocolURL;
-import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.order.Digest;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.RequestHeader;

@@ -1,36 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-  <title>Migrate URLdb</title>
-  #%env/templates/metas.template%#
-</head>
-<body>
-  #%env/templates/header.template%#
-  #%env/templates/submenuIndexControl.template%#
-  <h2>Migrate URLdb to embedded Solr Index</h2>
-  <p>Convert old meta data (urldb) index to embedded Solr fulltext index.</p>
-  <dl>
-    <dd>
-      <p>A low priority background job has been started which reads the old index, adds it to Solr and deletes the entry from the old index.</p>
-      <p>The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).<br />
-      If you feel that the not accessed entries are still relevant, with this migration all entries from the old urldb index will be migrated.</p>
-      <p>You may refresh this page to see how many entries in the old index are left for migration</p>
-      <p>Hint: this background task runs until all entries are migrated or YaCy is shutdown. The migration is not automatically restarted.</p>
-    </dd>
-  </dl>
-  <form action="migrateurldb_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
-    <fieldset>
-      <input type="hidden" name="lastcount" value="#[lastcount]#" />
-      <input type="hidden" name="lasttime" value="#[lasttime]#" />
-      <p><b>#[count]# entries</b> in old index left to migrate.</p>
-      <p>For large indexes this may run for a long time (migration speed: #[speed]# entries per minute) <input type="submit" name="dorefresh" value="refresh" /></p>
-    </fieldset>
-  </form>
-  #%env/templates/footer.template%#
-</body>
-</html>

@@ -1,44 +0,0 @@
-// migrateurldb_p.java
-
-import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.migration;
-import net.yacy.search.Switchboard;
-import net.yacy.server.serverObjects;
-import net.yacy.server.serverSwitch;
-
-public class migrateurldb_p {
-
-    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
-        final serverObjects prop = new serverObjects();
-        final Switchboard sb = (Switchboard) env;
-
-        int cnt;
-        if ((cnt = migration.migrateUrldbtoSolr(sb)) > 0) {
-            prop.put("count", cnt);
-            if (post != null && post.containsKey("dorefresh")) {
-                int lastcount = post.getInt("lastcount", 0);
-                Long t = post.getLong("lasttime", 1);
-                Double difft = (System.currentTimeMillis() - t) / 60000.0d;
-                int diff = (int) ((lastcount - cnt) / difft);
-                prop.put("speed", diff);
-                prop.put("lasttime", t);
-                prop.put("lastcount", lastcount);
-            } else {
-                prop.put("speed", "?");
-                prop.put("lastcount", cnt);
-                prop.put("lasttime", System.currentTimeMillis());
-            }
-        } else {
-            prop.put("speed", "");
-            prop.put("count", "no urldb index available");
-        }
-
-        // return rewrite properties
-        return prop;
-    }
-}

@@ -42,7 +42,7 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.Memory;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReferenceRow;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.FileUtils;
@@ -182,8 +182,8 @@ public final class transferRWI {
         String wordHash;
         byte[] urlHash;
         WordReferenceRow iEntry;
-        final HandleSet unknownURL = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
-        final HandleSet knownURL = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+        final HandleSet unknownURL = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
+        final HandleSet knownURL = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
         final ArrayList<String> wordhashes = new ArrayList<String>();
         int received = 0;
         int blocked = 0;

@@ -57,7 +57,6 @@ import org.apache.http.HttpHost;
 import org.apache.http.HttpResponse;
 import org.apache.http.auth.AuthScope;
 import org.apache.http.auth.UsernamePasswordCredentials;
-import org.apache.http.client.ClientProtocolException;
 import org.apache.http.client.CredentialsProvider;
 import org.apache.http.client.config.CookieSpecs;
 import org.apache.http.client.config.RequestConfig;

@@ -51,7 +51,7 @@ import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.robots.RobotsTxt;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.BufferedObjectIndex;
 import net.yacy.kelondro.index.Row;
 import net.yacy.kelondro.index.RowHandleSet;
@@ -96,7 +96,7 @@ public class Balancer {
         this.cacheStacksPath = cachePath;
         this.domainStacks = new ConcurrentHashMap<String, HostHandles>();
         this.domStackInitSize = Integer.MAX_VALUE;
-        this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+        this.double_push_check = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
         this.zeroWaitingCandidates = new ArrayList<Map.Entry<String, byte[]>>();
         this.random = new Random(System.currentTimeMillis());
@@ -564,7 +564,7 @@ public class Balancer {
         if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
         this.domainStacks.clear();
         this.lastDomainStackFill = System.currentTimeMillis();
-        final HandleSet blackhandles = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
+        final HandleSet blackhandles = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
         String host;
         Request request;
         int count = 0;

@@ -49,7 +49,6 @@ import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.blob.MapHeap;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.FileUtils;
@@ -588,7 +587,7 @@ public final class CrawlSwitchboard {
             r = sei.next();
             String handle = r.profileHandle();
             RowHandleSet us = this.profilesActiveCrawlsCounter.get(handle);
-            if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);}
+            if (us == null) {us = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);}
             if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many
             deletionCandidate.remove(handle);
             if (deletionCandidate.size() == 0) return new HashSet<String>(0);

@@ -32,7 +32,7 @@ import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.robots.RobotsTxt;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.BufferedObjectIndex;
 import net.yacy.kelondro.index.Row;
 import net.yacy.kelondro.index.RowHandleSet;
@@ -60,7 +60,7 @@ public class HostQueue {
             final boolean exceed134217727) {
         this.hostHash = hostHash;
         this.queuesPath = queuesPath;
-        this.urlHashDoubleCheck = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+        this.urlHashDoubleCheck = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
         // create a stack for newly entered entries
         if (!(this.queuesPath.exists())) this.queuesPath.mkdir(); // make the path

@@ -43,7 +43,7 @@ import net.yacy.cora.order.NaturalOrder;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.kelondro.blob.MapHeap;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;

 public class BookmarksDB {
@@ -212,7 +212,7 @@ public class BookmarksDB {
         final TreeSet<String> set=new TreeSet<String>(new bookmarkComparator(true));
         final String tagHash=BookmarkHelper.tagHash(tagName);
         final Tag tag=getTag(tagHash);
-        RowHandleSet hashes = tag == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes();
+        RowHandleSet hashes = tag == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10) : tag.getUrlHashes();
         if (priv) {
             for (byte[] hash: hashes) set.add(ASCII.String(hash));
         } else {
@@ -389,7 +389,7 @@ public class BookmarksDB {
         private Tag(final String name) {
             this.tagHash = BookmarkHelper.tagHash(name);
             this.tagName = name;
-            this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
+            this.urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
         }

         /**

@@ -11,16 +11,16 @@
 // LICENSE
 //
 // This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
+// it under the terms of the GNU General private License as published by
 // the Free Software Foundation; either version 2 of the License, or
 // (at your option) any later version.
 //
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
+// GNU General private License for more details.
 //
-// You should have received a copy of the GNU General Public License
+// You should have received a copy of the GNU General private License
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@@ -31,7 +31,6 @@ import java.text.ParseException;
 import java.util.Date;
 import java.util.List;
 import java.util.Properties;
-import java.util.regex.Pattern;

 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
@@ -43,7 +42,6 @@ import net.yacy.cora.order.Digest;
 import net.yacy.cora.order.NaturalOrder;
 import net.yacy.cora.util.ByteBuffer;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.data.word.WordReferenceRow;
 import net.yacy.kelondro.data.word.WordReferenceVars;
@@ -59,7 +57,7 @@ public class URIMetadataRow {
     // this object stores attributes for URL entries
-    public static final Row rowdef = new Row(
+    private static final Row rowdef = new Row(
         "String hash-12, " +            // the url's hash
         "String comp-360, " +           // components: the url, description, author, tags and publisher
         "Cardinal mod-4 {b256}, " +     // last-modified from the httpd
@@ -108,7 +106,7 @@ public class URIMetadataRow {
     private WordReference word; // this is only used if the url is transported via remote search requests
     private Components comp;

-    public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) {
+    private URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) {
         this.entry = entry;
         this.snippet = "";
         this.word = searchedWord;
@@ -242,17 +240,6 @@
         return h;
     }

-    private String hostHash = null;
-    public String hosthash() {
-        if (this.hostHash != null) return this.hostHash;
-        this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6);
-        return this.hostHash;
-    }
-
-    public boolean matches(final Pattern matcher) {
-        return this.metadata().matches(matcher);
-    }
-
     public DigestURL url() {
         return this.metadata().url();
     }
@@ -281,7 +268,7 @@
         return this.metadata().lon();
     }

-    private Components metadata() {
+    public Components metadata() {
         // avoid double computation of metadata elements
         if (this.comp != null) return this.comp;
         // parse elements from comp field;
@@ -434,20 +421,6 @@
         }
     }

-    public Request toBalancerEntry(final String initiatorHash) {
-        return new Request(
-                ASCII.getBytes(initiatorHash),
-                metadata().url(),
-                referrerHash(),
-                metadata().dc_title(),
-                moddate(),
-                null,
-                0,
-                0,
-                0,
-                0);
-    }
-
     /**
      * @return the object as String.<br>
      *         This e.g. looks like this:
@@ -472,7 +445,7 @@
         private final String dc_title, dc_creator, dc_subject, dc_publisher;
         private String latlon; // a comma-separated tuple as "<latitude>,<longitude>" where the coordinates are given as WGS84 spatial coordinates in decimal degrees

-        public Components(
+        private Components(
                 final String urlRaw,
                 final byte[] urlhash,
                 final String title,
@@ -489,12 +462,7 @@
             this.dc_publisher = publisher;
             this.latlon = latlon;
         }
-        public boolean matches(final Pattern matcher) {
-            if (this.urlRaw != null) return matcher.matcher(this.urlRaw.toLowerCase()).matches();
-            if (this.url != null) return matcher.matcher(this.url.toNormalform(true).toLowerCase()).matches();
-            return false;
-        }
-        public DigestURL url() {
+        private DigestURL url() {
             if (this.url == null) {
                 try {
                     this.url = new DigestURL(this.urlRaw, this.urlHash);
@@ -506,11 +474,11 @@
             }
             return this.url;
         }
-        public String dc_title() { return this.dc_title; }
-        public String dc_creator() { return this.dc_creator; }
-        public String dc_publisher() { return this.dc_publisher; }
-        public String dc_subject() { return this.dc_subject; }
-        public double lat() {
+        private String dc_title() { return this.dc_title; }
+        private String dc_creator() { return this.dc_creator; }
+        private String dc_publisher() { return this.dc_publisher; }
+        private String dc_subject() { return this.dc_subject; }
+        private double lat() {
             if (this.latlon == null || this.latlon.isEmpty()) return 0.0d;
             final int p = this.latlon.indexOf(',');
             if (p < 0) return 0.0d;
@@ -523,7 +491,7 @@
                 return 0.0d;
             }
         }
-        public double lon() {
+        private double lon() {
             if (this.latlon == null || this.latlon.isEmpty()) return 0.0d;
             final int p = this.latlon.indexOf(',');
             if (p < 0) return 0.0d;

@@ -40,10 +40,8 @@ import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.Bitfield;
 import net.yacy.kelondro.util.MemoryControl;

 public class Word {

     /**
      * this is the lenght(12) of the hash key that is used:<br>
      * - for seed hashes (this Object)<br>
@@ -52,6 +50,7 @@
      * these hashes all shall be generated by base64.enhancedCoder
      */
     public static final int commonHashLength = 12;
+    public static final Base64Order commonHashOrder = Base64Order.enhancedCoder;

     private static final int hashCacheSize = Math.max(20000, Math.min(200000, (int) (MemoryControl.available() / 40000L)));
     private static ARC<String, byte[]> hashCache = null;
@@ -64,12 +63,6 @@
             ConcurrentLog.info("Word", "hashCache.size = " + 1000);
         }
     }
-    /*
-    private static ConcurrentHashMap<String, byte[]> hashCache = null;
-    static {
-        hashCache = new ConcurrentHashMap<String, byte[]>();
-    }
-    */

     // object carries statistics for words and sentences
     public int count; // number of occurrences
@@ -122,7 +115,7 @@
         byte[] h = hashCache.get(wordlc);
         if (h != null) return h;
         // calculate the hash
-        h = Base64Order.enhancedCoder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength);
+        h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength);
         while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) {
             // ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer
             // statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never)
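For orientation, a minimal usage sketch of the two constants that the Word hunk above makes the single source of truth; the demo class and the sample strings are illustrative and not part of the commit, assuming Word.word2hash and Base64Order.compare behave as they do elsewhere in the YaCy code base:

    import net.yacy.kelondro.data.word.Word;

    class WordHashExample {                            // hypothetical demo class
        public static void main(final String[] args) {
            final byte[] a = Word.word2hash("yacy");   // hash length == Word.commonHashLength (12)
            final byte[] b = Word.word2hash("search");
            // Word.commonHashOrder (Base64Order.enhancedCoder) is now the canonical order
            // for storing and comparing such hashes, e.g. inside a RowHandleSet
            final int cmp = Word.commonHashOrder.compare(a, b);
            System.out.println(a.length + " " + cmp);
        }
    }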

@@ -40,7 +40,7 @@ import net.yacy.cora.storage.ComparableARC;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.MemoryControl;
 import net.yacy.kelondro.util.MergeIterator;
@@ -102,7 +102,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
         this.targetFileSize = targetFileSize;
         this.maxFileSize = maxFileSize;
         this.writeBufferSize = writeBufferSize;
-        this.removeDelayedURLs = new TreeMap<byte[], HandleSet>(URIMetadataRow.rowdef.objectOrder);
+        this.removeDelayedURLs = new TreeMap<byte[], HandleSet>(Word.commonHashOrder);
         this.flushShallRun = true;
         this.flushThread = new FlushThread();
         this.flushThread.start();
@@ -399,7 +399,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
             r = this.removeDelayedURLs.get(termHash);
         }
         if (r == null) {
-            r = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+            r = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
         }
         try {
             r.put(urlHashBytes);
@@ -414,7 +414,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
     @Override
     public void removeDelayed() throws IOException {
-        final HandleSet words = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed.
+        final HandleSet words = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed.
         synchronized (this.removeDelayedURLs) {
             for (final byte[] b: this.removeDelayedURLs.keySet()) try {words.put(b);} catch (final SpaceExceededException e) {}
         }
@@ -476,7 +476,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
         }

         public RemoveReducer(final byte[] urlHashBytes) {
-            this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+            this.urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
             try {
                 this.urlHashes.put(urlHashBytes);
             } catch (final SpaceExceededException e) {

@@ -43,11 +43,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
 import net.yacy.cora.storage.Configuration.Entry;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.LibraryProvider;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.index.Index;
-import net.yacy.kelondro.index.Row;
 import net.yacy.kelondro.workflow.BusyThread;
-import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.search.schema.CollectionSchema;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -282,83 +278,6 @@ public class migration {
             sb.setConfig("crawler.http.acceptCharset", sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
         }
     }

-    /**
-     * converts old urldb to Solr.
-     * In chunks of 1000 entries.
-     * Creates a lock file in workdir to allow only one active migration thread
-     * @return current size of urldb index
-     */
-    @SuppressWarnings("deprecation")
-    public static int migrateUrldbtoSolr(final Switchboard sb) {
-        int ret = 0;
-        final File f;
-        final Fulltext ft = sb.index.fulltext();
-
-        if (ft.getURLDb() != null) {
-            ret = ft.getURLDb().size();
-            f = new File(sb.workPath, "migrateUrldbtoSolr.lck");
-            f.deleteOnExit();
-            if (f.exists()) {
-                return ret;
-            }
-            try {
-                f.createNewFile();
-            } catch (final IOException ex) {
-                ConcurrentLog.info("migrateUrldbtoSolr","could not create lock file");
-            }
-
-            final Thread t = new Thread() {
-                boolean go = true;
-                final Index urldb = ft.getURLDb();
-
-                public void run() {
-                    try {
-                        Thread.currentThread().setName("migration.migrateUrldbtoSolr");
-                        int i = urldb.size();
-                        while (go && i > 0) {
-                            List<Row.Entry> chunk = urldb.random(1000);
-                            if ((chunk == null) || (chunk.size() == 0)) {
-                                go = false;
-                                break;
-                            }
-                            Iterator<Row.Entry> chunkit = chunk.iterator();
-                            while (go && chunkit.hasNext()) {
-                                try { // to catch any data errors
-                                    URIMetadataRow row = new URIMetadataRow(chunkit.next(), null);
-                                    ft.putMetadata(row); // this deletes old urldb-entry first and inserts into Solr
-                                    i--;
-                                    if (Switchboard.getSwitchboard().shallTerminate()) {
-                                        go = false;
-                                    }
-                                } catch (final Exception e) {
-                                    ConcurrentLog.info("migrateUrldbtoSolr", "some error while adding old data to new index, continue with next entry");
-                                }
-                            }
-                            ConcurrentLog.info("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)");
-                        }
-                        ft.commit(true);
-                    } catch (final IOException ex) {
-                        ConcurrentLog.info("migrateUrldbtoSolr", "error reading old urldb index");
-                    } finally {
-                        if (f.exists()) {
-                            f.delete(); // delete lock file
-                        }
-                    }
-                }
-
-                public void exit() {
-                    go = false;
-                }
-            };
-            t.setPriority(Thread.MIN_PRIORITY);
-            t.start();
-        }
-        return ret;
-    }
-
     /**
      * Reindex embedded solr index

@@ -39,7 +39,6 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.Memory;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.index.RowHandleSet;
@@ -194,7 +193,7 @@ public class Dispatcher {
         final ArrayList<ReferenceContainer<WordReference>> rc;
         if (ram) {
             // selection was only from ram, so we have to carefully remove only the selected entries
-            final HandleSet urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+            final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
             Iterator<WordReference> it;
             for (final ReferenceContainer<WordReference> c: containers) {
                 urlHashes.clear();

@@ -52,6 +52,7 @@ import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.data.ListManager;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.SetTools;
@@ -462,7 +463,7 @@ public class Blacklist {
         }
         HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType);
         if (urlHashCache == null) {
-            urlHashCache = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
+            urlHashCache = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
             if (isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
                 try {
                     urlHashCache.put(url.hash());
@@ -679,13 +680,13 @@
             try {
                 ObjectInputStream in = new ObjectInputStream(new FileInputStream(cachefile));
                 RowHandleSet rhs = (RowHandleSet) in.readObject();
-                this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0) : rhs);
+                this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0) : rhs);
                 in.close();
                 return;
             } catch (final Throwable e) {
                 ConcurrentLog.logException(e);
             }
         }
-        this.cachedUrlHashs.put(type, new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0));
+        this.cachedUrlHashs.put(type, new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0));
     }
 }

@@ -506,7 +506,6 @@ public final class Switchboard extends serverSwitch {
                 this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
             } catch (final IOException e) {ConcurrentLog.logException(e);}
             if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) {
-                this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
                 try {this.index.fulltext().connectLocalSolr();} catch (final IOException e) {ConcurrentLog.logException(e);}
             }
             this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
@@ -1347,7 +1346,6 @@ public final class Switchboard extends serverSwitch {
         if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
         if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) {
             this.index.fulltext().connectLocalSolr();
-            this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
         }
         this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));

@@ -74,10 +74,6 @@ public class DocumentIndex extends Segment {
                 );
         super.connectRWI(cachesize, targetFileSize * 4 - 1);
         super.connectCitation(cachesize, targetFileSize * 4 - 1);
-        super.connectUrlDb(
-                false, // useTailCache
-                false  // exceed134217727
-                );
         super.fulltext().connectLocalSolr();
         super.fulltext().setUseWebgraph(true);
         this.callback = callback;

@@ -54,7 +54,6 @@ import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
 import net.yacy.cora.federate.solr.instance.InstanceMirror;
 import net.yacy.cora.federate.solr.instance.RemoteInstance;
 import net.yacy.cora.federate.solr.instance.ShardInstance;
-import net.yacy.cora.order.CloneableIterator;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.cora.sorting.ScoreMap;
@@ -66,10 +65,6 @@ import net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.WordReferenceVars;
-import net.yacy.kelondro.index.Cache;
-import net.yacy.kelondro.index.Index;
-import net.yacy.kelondro.index.Row;
-import net.yacy.kelondro.table.SplitTable;
 import net.yacy.kelondro.util.MemoryControl;
 import net.yacy.search.Switchboard;
 import net.yacy.search.schema.CollectionConfiguration;
@@ -90,9 +85,7 @@ public final class Fulltext {
     // class objects
     private final File segmentPath;
     private final File archivePath;
-    private Index urlIndexFile;
     private Export exportthread; // will have a export thread assigned if exporter is running
-    private String tablename;
     private ArrayList<HostStat> statsDump;
     private InstanceMirror solrInstances;
     private final CollectionConfiguration collectionConfiguration;
@@ -103,8 +96,6 @@
             final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) {
         this.segmentPath = segmentPath;
         this.archivePath = archivePath;
-        this.tablename = null;
-        this.urlIndexFile = null;
         this.exportthread = null; // will have a export thread assigned if exporter is running
         this.statsDump = null;
         this.solrInstances = new InstanceMirror();
@@ -121,35 +112,6 @@
         return this.writeWebgraph;
     }

-    /**
-     * @deprecated
-     * used only for migration
-     * @return the connected URLDb
-     */
-    @Deprecated
-    public Index getURLDb() {
-        return this.urlIndexFile;
-    }
-
-    protected void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
-        if (this.urlIndexFile != null) return;
-        this.tablename = tablename;
-        this.urlIndexFile = new SplitTable(new File(this.segmentPath, "default"), tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727);
-        // SplitTable always returns != null, even if no file exists.
-        // as old UrlDb should be null if not exist, check and close if empty
-        // TODO: check if a SplitTable.open() returning null or error status on not existing file is preferable
-        if (this.urlIndexFile.isEmpty()) {
-            disconnectUrlDb();
-        }
-    }
-
-    public void disconnectUrlDb() {
-        if (this.urlIndexFile == null) return;
-        this.urlIndexFile.close();
-        this.urlIndexFile = null;
-    }
-
     public CollectionConfiguration getDefaultConfiguration() {
         return this.collectionConfiguration;
     }
@@ -233,19 +195,13 @@
     }

     public void clearCaches() {
-        if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
         if (this.statsDump != null) this.statsDump.clear();
         this.solrInstances.clearCaches();
         this.statsDump = null;
     }

-    public void clearURLIndex() throws IOException {
+    public void clearURLIndex() {
         if (this.exportthread != null) this.exportthread.interrupt();
-        if (this.urlIndexFile == null) {
-            SplitTable.delete(new File(this.segmentPath, "default"), this.tablename);
-        } else {
-            this.urlIndexFile.clear();
-        }
         this.statsDump = null;
         this.commit(true);
     }
@@ -280,8 +236,7 @@
     public long collectionSize() {
         long t = System.currentTimeMillis();
         if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue;
-        long size = this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
-        size += this.solrInstances.getDefaultMirrorConnector().getSize();
+        long size = this.solrInstances.getDefaultMirrorConnector().getSize();
         this.collectionSizeLastAccess = t;
         this.collectionSizeLastValue = size;
         return size;
@@ -297,10 +252,6 @@
     public void close() {
         this.statsDump = null;
-        if (this.urlIndexFile != null) {
-            this.urlIndexFile.close();
-            this.urlIndexFile = null;
-        }
         this.solrInstances.close();
     }
@@ -364,27 +315,12 @@
         try {
             SolrDocument doc = this.getDefaultConnector().getDocumentById(u);
             if (doc != null) {
-                if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); // migration
                 return new URIMetadataNode(doc, wre, weight);
             }
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
         }
-
-        // get the metadata from the old metadata index
-        if (this.urlIndexFile != null) try {
-            // slow migration to solr
-            final Row.Entry entry = this.urlIndexFile.remove(urlHash);
-            if (entry == null) return null;
-            URIMetadataRow row = new URIMetadataRow(entry, wre);
-            SolrInputDocument solrInput = this.collectionConfiguration.metadata2solr(row);
-            this.putDocument(solrInput);
-            SolrDocument sd = this.collectionConfiguration.toSolrDocument(solrInput);
-            return new URIMetadataNode(sd, wre, weight);
-        } catch (final IOException e) {
-            ConcurrentLog.logException(e);
-        }
         return null;
     }
@@ -394,14 +330,8 @@
         String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
         String url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
         ConcurrentLog.info("Fulltext", "indexing: " + id + " " + url);
-        byte[] idb = ASCII.getBytes(id);
         try {
-            if (this.urlIndexFile != null) this.urlIndexFile.remove(idb);
-            //Date sdDate = (Date) connector.getFieldById(id, CollectionSchema.last_modified.getSolrFieldName());
-            //Date docDate = null;
-            //if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) {
             connector.add(doc);
-            //}
         } catch (final SolrException e) {
             throw new IOException(e.getMessage(), e);
         }
@@ -428,7 +358,6 @@
         byte[] idb = entry.hash();
         String id = ASCII.String(idb);
         try {
-            if (this.urlIndexFile != null) this.urlIndexFile.remove(idb);
             // because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten
             SolrDocument sd = this.getDefaultConnector().getDocumentById(id);
             if (sd == null || (new URIMetadataNode(sd)).isOlder(entry)) {
@@ -458,24 +387,7 @@
                     (freshdate == null || freshdate.after(now)) ? null :
                     (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
-                // delete in old metadata structure
-                if (Fulltext.this.urlIndexFile != null) {
-                    final ArrayList<String> l = new ArrayList<String>();
-                    CloneableIterator<byte[]> i;
-                    try {
-                        i = Fulltext.this.urlIndexFile.keys(true, null);
-                        String hash;
-                        while (i != null && i.hasNext()) {
-                            hash = ASCII.String(i.next());
-                            if (hosthashes.contains(hash.substring(6))) l.add(hash);
-                        }
-                        // then delete the urls using this list
-                        for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
-                    } catch (final IOException e) {}
-                }
-                // finally remove the line with statistics
+                // remove the line with statistics
                 if (Fulltext.this.statsDump != null) {
                     final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
                     HostStat hs;
@@ -578,12 +490,6 @@
             } catch (final Throwable e) {
                 ConcurrentLog.logException(e);
             }
-            if (Fulltext.this.urlIndexFile != null) try {
-                for (String id: deleteIDs) {
-                    final Row.Entry r = Fulltext.this.urlIndexFile.remove(ASCII.getBytes(id));
-                    if (r != null) Fulltext.this.statsDump = null;
-                }
-            } catch (final IOException e) {}
         }

     public boolean remove(final byte[] urlHash) {
@@ -595,20 +501,12 @@
         } catch (final Throwable e) {
             ConcurrentLog.logException(e);
         }
-        if (this.urlIndexFile != null) try {
-            final Row.Entry r = this.urlIndexFile.remove(urlHash);
-            if (r != null) this.statsDump = null;
-            return r != null;
-        } catch (final IOException e) {
-            return false;
-        }
         return false;
     }

     @Deprecated
     public boolean exists(final String urlHash) {
         if (urlHash == null) return false;
-        if (this.urlIndexFile != null && this.urlIndexFile.has(ASCII.getBytes(urlHash))) return true;
         try {
             if (this.getDefaultConnector().existsById(urlHash)) return true;
         } catch (final Throwable e) {
@@ -629,17 +527,6 @@
         if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e;
         Set<String> idsC = new HashSet<String>();
         idsC.addAll(ids);
-        if (this.urlIndexFile != null) {
-            Iterator<String> idsi = idsC.iterator();
-            String h;
-            while (idsi.hasNext()) {
-                h = idsi.next();
-                if (this.urlIndexFile.has(ASCII.getBytes(h))) {
-                    idsi.remove();
-                    e.add(h);
-                }
-            }
-        }
         try {
             Set<String> e1 = this.getDefaultConnector().existsByIds(idsC);
             e.addAll(e1);

@@ -66,7 +66,6 @@ import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.citation.CitationReferenceFactory;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.data.word.WordReferenceFactory;
@@ -205,10 +204,6 @@ public class Segment {
         return this.urlCitationIndex == null ? 0 : this.urlCitationIndex.getSegmentCount();
     }

-    public void connectUrlDb(final boolean useTailCache, final boolean exceed134217727) {
-        this.fulltext.connectUrlDb(UrlDbName, useTailCache, exceed134217727);
-    }
-
     public Fulltext fulltext() {
         return this.fulltext;
     }
@@ -280,7 +275,7 @@
     }

     private static RowHandleSet getPossibleRootHashes(DigestURL url) {
-        RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
+        RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
         String rootStub = url.getProtocol() + "://" + url.getHost();
         try {
             rootCandidates.put(new DigestURL(rootStub).hash());

@@ -70,7 +70,7 @@ import net.yacy.document.LargeNumberCache;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.data.word.WordReferenceFactory;
 import net.yacy.kelondro.data.word.WordReferenceVars;
@@ -277,7 +277,7 @@ public final class SearchEvent {
         this.addRunning = true;
         this.receivedRemoteReferences = new AtomicInteger(0);
         this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
-        this.urlhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
+        this.urlhashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 100);
         this.taggingPredicates = new HashMap<String, String>();
         for (Tagging t: LibraryProvider.autotagging.getVocabularies()) {
             this.taggingPredicates.put(t.getName(), t.getPredicate());
