From 1ea17bd9f3034849a8325f103622baf87663a040 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 20 Jan 2014 18:31:46 +0100 Subject: [PATCH] - removed old metadata database and all migration code - refactored all code which uses URIMetadataRow as standard for word hash length and word hash ordering and moved that to the class 'Word', because the class URIMetadataRow defined the old metadata data structure and should be superfluous in the future - removed unused methods from URIMetadataRow as preparation for further removal of that class --- htroot/IndexControlRWIs_p.java | 17 ++- htroot/IndexControlURLs_p.java | 2 +- htroot/IndexFederated_p.java | 2 - htroot/SettingsAck_p.java | 1 - htroot/migrateurldb_p.html | 36 ------ htroot/migrateurldb_p.java | 44 ------- htroot/yacy/transferRWI.java | 6 +- .../yacy/cora/protocol/http/HTTPClient.java | 1 - source/net/yacy/crawler/Balancer.java | 6 +- source/net/yacy/crawler/CrawlSwitchboard.java | 3 +- source/net/yacy/crawler/HostQueue.java | 4 +- source/net/yacy/data/BookmarksDB.java | 6 +- .../kelondro/data/meta/URIMetadataRow.java | 60 +++------ source/net/yacy/kelondro/data/word/Word.java | 13 +- source/net/yacy/kelondro/rwi/IndexCell.java | 10 +- source/net/yacy/migration.java | 81 ------------ source/net/yacy/peers/Dispatcher.java | 3 +- source/net/yacy/repository/Blacklist.java | 7 +- source/net/yacy/search/Switchboard.java | 2 - .../net/yacy/search/index/DocumentIndex.java | 4 - source/net/yacy/search/index/Fulltext.java | 119 +----------------- source/net/yacy/search/index/Segment.java | 7 +- source/net/yacy/search/query/SearchEvent.java | 4 +- 23 files changed, 54 insertions(+), 384 deletions(-) delete mode 100644 htroot/migrateurldb_p.html delete mode 100644 htroot/migrateurldb_p.java diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 33b63489a..0852f0382 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -45,7 +45,6 @@ import 
net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -110,8 +109,8 @@ public class IndexControlRWIs_p { final String[] urls = post.getAll("urlhx.*"); HandleSet urlb = new RowHandleSet( - URIMetadataRow.rowdef.primaryKeyLength, - URIMetadataRow.rowdef.objectOrder, + Word.commonHashLength, + Word.commonHashOrder, urls.length); if ( urls != null ) { for ( final String s : urls ) { @@ -165,8 +164,8 @@ public class IndexControlRWIs_p { final Iterator en = index.entries(); urlb = new RowHandleSet( - URIMetadataRow.rowdef.primaryKeyLength, - URIMetadataRow.rowdef.objectOrder, + Word.commonHashLength, + Word.commonHashOrder, index.size()); while ( en.hasNext() ) { try { @@ -208,8 +207,8 @@ public class IndexControlRWIs_p { } final HandleSet urlHashes = new RowHandleSet( - URIMetadataRow.rowdef.primaryKeyLength, - URIMetadataRow.rowdef.objectOrder, + Word.commonHashLength, + Word.commonHashOrder, 0); for ( final byte[] b : urlb ) { try { @@ -363,8 +362,8 @@ public class IndexControlRWIs_p { final String blacklist = post.get("blacklist", ""); final HandleSet urlHashes = new RowHandleSet( - URIMetadataRow.rowdef.primaryKeyLength, - URIMetadataRow.rowdef.objectOrder, + Word.commonHashLength, + Word.commonHashOrder, urlb.size()); if ( post.containsKey("blacklisturls") ) { final String[] supportedBlacklistTypes = diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 6d8e1237c..a57cc79fd 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -149,7 +149,7 @@ public class IndexControlURLs_p { // delete everything if ( post.containsKey("deletecomplete") ) { if ( post.get("deleteIndex", "").equals("on") ) { - try 
{segment.fulltext().clearURLIndex();} catch (final IOException e) {} + segment.fulltext().clearURLIndex(); try {segment.fulltext().clearLocalSolr();} catch (final IOException e) {} } if ( post.get("deleteRemoteSolr", "").equals("on")) { diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 4947b4c5c..d66077310 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -87,11 +87,9 @@ public class IndexFederated_p { if (previous_core_fulltext && !post_core_fulltext) { // switch off sb.index.fulltext().disconnectLocalSolr(); - sb.index.fulltext().disconnectUrlDb(); } if (!previous_core_fulltext && post_core_fulltext) { // switch on - sb.index.connectUrlDb(sb.useTailCache, sb.exceed134217727); try { sb.index.fulltext().connectLocalSolr(); } catch (final IOException e) { ConcurrentLog.logException(e); } } diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 8d982eff1..edc8821e5 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -36,7 +36,6 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.id.MultiProtocolURL; -import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; diff --git a/htroot/migrateurldb_p.html b/htroot/migrateurldb_p.html deleted file mode 100644 index 7da6e2157..000000000 --- a/htroot/migrateurldb_p.html +++ /dev/null @@ -1,36 +0,0 @@ - - - - Migrate URLdb - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuIndexControl.template%# - -

Migrate URLdb to embedded Solr Index

- -

Convert old meta data (urldb) index to embedded Solr fulltext index.

- -
-
-

A low priority background job has been started which reads the old index, adds it to Solr and deletes the entry from the old index.

-

The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).
- If you feel that the not accessed entries are still relevant, with this migration all entries from the old urldb index will be migrated.

-

You may refresh this page to see how many entries in the old index are left for migration

-

Hint: this background task runs until all entries are migrated or YaCy is shutdown. The migration is not automatically restarted.

-
-
-
-
- - - -

#[count]# entries in old index left to migrate.

-

For large indexes this may run for a long time (migration speed: #[speed]# entries per minute)

-
-
- - #%env/templates/footer.template%# - - diff --git a/htroot/migrateurldb_p.java b/htroot/migrateurldb_p.java deleted file mode 100644 index 3a977aa0d..000000000 --- a/htroot/migrateurldb_p.java +++ /dev/null @@ -1,44 +0,0 @@ -// migrateurldb_p.java - -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.migration; -import net.yacy.search.Switchboard; -import net.yacy.server.serverObjects; -import net.yacy.server.serverSwitch; - -public class migrateurldb_p { - - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { - final serverObjects prop = new serverObjects(); - final Switchboard sb = (Switchboard) env; - - int cnt; - - if ((cnt = migration.migrateUrldbtoSolr(sb)) > 0) { - prop.put("count", cnt); - - if (post != null && post.containsKey("dorefresh")) { - int lastcount = post.getInt("lastcount", 0); - Long t = post.getLong("lasttime", 1); - - Double difft = (System.currentTimeMillis() - t) / 60000.0d; - int diff = (int)((lastcount - cnt) / difft) ; - prop.put("speed", diff); - prop.put("lasttime", t); - prop.put("lastcount", lastcount); - - } else { - prop.put("speed", "?"); - prop.put("lastcount",cnt); - prop.put("lasttime", System.currentTimeMillis()); - } - } else { - prop.put("speed", ""); - prop.put("count", "no urldb index available"); - } - - - // return rewrite properties - return prop; - } -} \ No newline at end of file diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index e2a8d16fe..2367105ae 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -42,7 +42,7 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.Memory; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import 
net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; @@ -182,8 +182,8 @@ public final class transferRWI { String wordHash; byte[] urlHash; WordReferenceRow iEntry; - final HandleSet unknownURL = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); - final HandleSet knownURL = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + final HandleSet unknownURL = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); + final HandleSet knownURL = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); final ArrayList wordhashes = new ArrayList(); int received = 0; int blocked = 0; diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index 41e503687..7d4ac41ce 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -57,7 +57,6 @@ import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.ClientProtocolException; import org.apache.http.client.CredentialsProvider; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index a3ae93a02..0e4da872f 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -51,7 +51,7 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.robots.RobotsTxt; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.BufferedObjectIndex; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.RowHandleSet; @@ 
-96,7 +96,7 @@ public class Balancer { this.cacheStacksPath = cachePath; this.domainStacks = new ConcurrentHashMap(); this.domStackInitSize = Integer.MAX_VALUE; - this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + this.double_push_check = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); this.zeroWaitingCandidates = new ArrayList>(); this.random = new Random(System.currentTimeMillis()); @@ -564,7 +564,7 @@ public class Balancer { if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return; this.domainStacks.clear(); this.lastDomainStackFill = System.currentTimeMillis(); - final HandleSet blackhandles = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); + final HandleSet blackhandles = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); String host; Request request; int count = 0; diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index f60c04089..8d805b16d 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -49,7 +49,6 @@ import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.blob.MapHeap; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; @@ -588,7 +587,7 @@ public final class CrawlSwitchboard { r = sei.next(); String handle = r.profileHandle(); RowHandleSet us = this.profilesActiveCrawlsCounter.get(handle); - if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);} + if (us == null) {us = new 
RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);} if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many deletionCandidate.remove(handle); if (deletionCandidate.size() == 0) return new HashSet(0); diff --git a/source/net/yacy/crawler/HostQueue.java b/source/net/yacy/crawler/HostQueue.java index c78ac989d..9d7c51df3 100644 --- a/source/net/yacy/crawler/HostQueue.java +++ b/source/net/yacy/crawler/HostQueue.java @@ -32,7 +32,7 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.robots.RobotsTxt; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.BufferedObjectIndex; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.RowHandleSet; @@ -60,7 +60,7 @@ public class HostQueue { final boolean exceed134217727) { this.hostHash = hostHash; this.queuesPath = queuesPath; - this.urlHashDoubleCheck = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + this.urlHashDoubleCheck = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); // create a stack for newly entered entries if (!(this.queuesPath.exists())) this.queuesPath.mkdir(); // make the path diff --git a/source/net/yacy/data/BookmarksDB.java b/source/net/yacy/data/BookmarksDB.java index d9c0140a6..049513af3 100644 --- a/source/net/yacy/data/BookmarksDB.java +++ b/source/net/yacy/data/BookmarksDB.java @@ -43,7 +43,7 @@ import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.blob.MapHeap; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; public class BookmarksDB { @@ -212,7 +212,7 @@ public class 
BookmarksDB { final TreeSet set=new TreeSet(new bookmarkComparator(true)); final String tagHash=BookmarkHelper.tagHash(tagName); final Tag tag=getTag(tagHash); - RowHandleSet hashes = tag == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes(); + RowHandleSet hashes = tag == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10) : tag.getUrlHashes(); if (priv) { for (byte[] hash: hashes) set.add(ASCII.String(hash)); } else { @@ -389,7 +389,7 @@ public class BookmarksDB { private Tag(final String name) { this.tagHash = BookmarkHelper.tagHash(name); this.tagName = name; - this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); + this.urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); } /** diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index 2bf071661..7c4ad5eb9 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -11,16 +11,16 @@ // LICENSE // // This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by +// it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. +// GNU General Public License for more details. 
// -// You should have received a copy of the GNU General Public License +// You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -31,7 +31,6 @@ import java.text.ParseException; import java.util.Date; import java.util.List; import java.util.Properties; -import java.util.regex.Pattern; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.encoding.ASCII; @@ -43,7 +42,6 @@ import net.yacy.cora.order.Digest; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -59,7 +57,7 @@ public class URIMetadataRow { // this object stores attributes for URL entries - public static final Row rowdef = new Row( + private static final Row rowdef = new Row( "String hash-12, " + // the url's hash "String comp-360, " + // components: the url, description, author, tags and publisher "Cardinal mod-4 {b256}, " + // last-modified from the httpd @@ -108,7 +106,7 @@ public class URIMetadataRow { private WordReference word; // this is only used if the url is transported via remote search requests private Components comp; - public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { + private URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { this.entry = entry; this.snippet = ""; this.word = searchedWord; @@ -242,17 +240,6 @@ public class URIMetadataRow { return h; } - private String hostHash = null; - public String hosthash() { - if (this.hostHash != null) return this.hostHash; - this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6); - return this.hostHash; - } - - public boolean matches(final 
Pattern matcher) { - return this.metadata().matches(matcher); - } - public DigestURL url() { return this.metadata().url(); } @@ -281,7 +268,7 @@ public class URIMetadataRow { return this.metadata().lon(); } - private Components metadata() { + public Components metadata() { // avoid double computation of metadata elements if (this.comp != null) return this.comp; // parse elements from comp field; @@ -434,20 +421,6 @@ public class URIMetadataRow { } } - public Request toBalancerEntry(final String initiatorHash) { - return new Request( - ASCII.getBytes(initiatorHash), - metadata().url(), - referrerHash(), - metadata().dc_title(), - moddate(), - null, - 0, - 0, - 0, - 0); - } - /** * @return the object as String.
* This e.g. looks like this: @@ -472,7 +445,7 @@ public class URIMetadataRow { private final String dc_title, dc_creator, dc_subject, dc_publisher; private String latlon; // a comma-separated tuple as "," where the coordinates are given as WGS84 spatial coordinates in decimal degrees - public Components( + private Components( final String urlRaw, final byte[] urlhash, final String title, @@ -489,12 +462,7 @@ public class URIMetadataRow { this.dc_publisher = publisher; this.latlon = latlon; } - public boolean matches(final Pattern matcher) { - if (this.urlRaw != null) return matcher.matcher(this.urlRaw.toLowerCase()).matches(); - if (this.url != null) return matcher.matcher(this.url.toNormalform(true).toLowerCase()).matches(); - return false; - } - public DigestURL url() { + private DigestURL url() { if (this.url == null) { try { this.url = new DigestURL(this.urlRaw, this.urlHash); @@ -506,11 +474,11 @@ public class URIMetadataRow { } return this.url; } - public String dc_title() { return this.dc_title; } - public String dc_creator() { return this.dc_creator; } - public String dc_publisher() { return this.dc_publisher; } - public String dc_subject() { return this.dc_subject; } - public double lat() { + private String dc_title() { return this.dc_title; } + private String dc_creator() { return this.dc_creator; } + private String dc_publisher() { return this.dc_publisher; } + private String dc_subject() { return this.dc_subject; } + private double lat() { if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; final int p = this.latlon.indexOf(','); if (p < 0) return 0.0d; @@ -523,7 +491,7 @@ public class URIMetadataRow { return 0.0d; } } - public double lon() { + private double lon() { if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; final int p = this.latlon.indexOf(','); if (p < 0) return 0.0d; diff --git a/source/net/yacy/kelondro/data/word/Word.java b/source/net/yacy/kelondro/data/word/Word.java index ca93fa7ca..6c207cfff 100644 --- 
a/source/net/yacy/kelondro/data/word/Word.java +++ b/source/net/yacy/kelondro/data/word/Word.java @@ -40,10 +40,8 @@ import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.MemoryControl; - public class Word { - /** * this is the lenght(12) of the hash key that is used:
* - for seed hashes (this Object)
@@ -51,7 +49,8 @@ public class Word { * - for L-URL hashes (plasmaLURL.urlHashLength)

* these hashes all shall be generated by base64.enhancedCoder */ - public static final int commonHashLength = 12; + public static final int commonHashLength = 12; + public static final Base64Order commonHashOrder = Base64Order.enhancedCoder; private static final int hashCacheSize = Math.max(20000, Math.min(200000, (int) (MemoryControl.available() / 40000L))); private static ARC hashCache = null; @@ -64,12 +63,6 @@ public class Word { ConcurrentLog.info("Word", "hashCache.size = " + 1000); } } - /* - private static ConcurrentHashMap hashCache = null; - static { - hashCache = new ConcurrentHashMap(); - } - */ // object carries statistics for words and sentences public int count; // number of occurrences @@ -122,7 +115,7 @@ public class Word { byte[] h = hashCache.get(wordlc); if (h != null) return h; // calculate the hash - h = Base64Order.enhancedCoder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength); + h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength); while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) { // ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer // statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never) diff --git a/source/net/yacy/kelondro/rwi/IndexCell.java b/source/net/yacy/kelondro/rwi/IndexCell.java index 7549a98ac..f3ea810b2 100644 --- a/source/net/yacy/kelondro/rwi/IndexCell.java +++ b/source/net/yacy/kelondro/rwi/IndexCell.java @@ -40,7 +40,7 @@ import net.yacy.cora.storage.ComparableARC; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.MemoryControl; import 
net.yacy.kelondro.util.MergeIterator; @@ -102,7 +102,7 @@ public final class IndexCell extends AbstractBu this.targetFileSize = targetFileSize; this.maxFileSize = maxFileSize; this.writeBufferSize = writeBufferSize; - this.removeDelayedURLs = new TreeMap(URIMetadataRow.rowdef.objectOrder); + this.removeDelayedURLs = new TreeMap(Word.commonHashOrder); this.flushShallRun = true; this.flushThread = new FlushThread(); this.flushThread.start(); @@ -399,7 +399,7 @@ public final class IndexCell extends AbstractBu r = this.removeDelayedURLs.get(termHash); } if (r == null) { - r = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + r = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); } try { r.put(urlHashBytes); @@ -414,7 +414,7 @@ public final class IndexCell extends AbstractBu @Override public void removeDelayed() throws IOException { - final HandleSet words = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed. + final HandleSet words = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed. 
synchronized (this.removeDelayedURLs) { for (final byte[] b: this.removeDelayedURLs.keySet()) try {words.put(b);} catch (final SpaceExceededException e) {} } @@ -476,7 +476,7 @@ public final class IndexCell extends AbstractBu } public RemoveReducer(final byte[] urlHashBytes) { - this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + this.urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); try { this.urlHashes.put(urlHashBytes); } catch (final SpaceExceededException e) { diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java index 4ea302dca..e19da7ce6 100644 --- a/source/net/yacy/migration.java +++ b/source/net/yacy/migration.java @@ -43,11 +43,7 @@ import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.storage.Configuration.Entry; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.LibraryProvider; -import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.index.Index; -import net.yacy.kelondro.index.Row; import net.yacy.kelondro.workflow.BusyThread; -import net.yacy.search.index.Fulltext; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; import org.apache.solr.client.solrj.SolrServerException; @@ -282,83 +278,6 @@ public class migration { sb.setConfig("crawler.http.acceptCharset", sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7")); } } - /** - * converts old urldb to Solr. - * In chunks of 1000 entries. 
- * Creates a lock file in workdir to allow only one active migration thread - * @return current size of urldb index - */ - @SuppressWarnings("deprecation") - public static int migrateUrldbtoSolr(final Switchboard sb) { - int ret = 0; - final File f; - final Fulltext ft = sb.index.fulltext(); - - if (ft.getURLDb() != null) { - ret = ft.getURLDb().size(); - f = new File(sb.workPath, "migrateUrldbtoSolr.lck"); - f.deleteOnExit(); - if (f.exists()) { - return ret; - } - try { - f.createNewFile(); - } catch (final IOException ex) { - ConcurrentLog.info("migrateUrldbtoSolr","could not create lock file"); - } - - final Thread t = new Thread() { - boolean go = true; - final Index urldb = ft.getURLDb(); - - public void run() { - try { - Thread.currentThread().setName("migration.migrateUrldbtoSolr"); - - int i = urldb.size(); - while (go && i > 0) { - - List chunk = urldb.random(1000); - if ((chunk == null) || (chunk.size() == 0)) { - go = false; - break; - } - Iterator chunkit = chunk.iterator(); - - while (go && chunkit.hasNext()) { - try { // to catch any data errors - URIMetadataRow row = new URIMetadataRow(chunkit.next(), null); - ft.putMetadata(row); // this deletes old urldb-entry first and inserts into Solr - i--; - if (Switchboard.getSwitchboard().shallTerminate()) { - go = false; - } - } catch (final Exception e) { - ConcurrentLog.info("migrateUrldbtoSolr", "some error while adding old data to new index, continue with next entry"); - } - } - ConcurrentLog.info("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)"); - } - ft.commit(true); - - } catch (final IOException ex) { - ConcurrentLog.info("migrateUrldbtoSolr", "error reading old urldb index"); - } finally { - if (f.exists()) { - f.delete(); // delete lock file - } - } - } - - public void exit() { - go = false; - } - }; - t.setPriority(Thread.MIN_PRIORITY); - t.start(); - } - return ret; - } /** * Reindex embedded solr index diff --git 
a/source/net/yacy/peers/Dispatcher.java b/source/net/yacy/peers/Dispatcher.java index 92da26e4f..a5ee4a1e0 100644 --- a/source/net/yacy/peers/Dispatcher.java +++ b/source/net/yacy/peers/Dispatcher.java @@ -39,7 +39,6 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.Memory; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.index.RowHandleSet; @@ -194,7 +193,7 @@ public class Dispatcher { final ArrayList> rc; if (ram) { // selection was only from ram, so we have to carefully remove only the selected entries - final HandleSet urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); Iterator it; for (final ReferenceContainer c: containers) { urlHashes.clear(); diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index d68999c4a..d54a3dd3a 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -52,6 +52,7 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.SetTools; @@ -462,7 +463,7 @@ public class Blacklist { } HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType); if (urlHashCache == null) { - urlHashCache = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + urlHashCache = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); if 
(isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) { try { urlHashCache.put(url.hash()); @@ -679,13 +680,13 @@ public class Blacklist { try { ObjectInputStream in = new ObjectInputStream(new FileInputStream(cachefile)); RowHandleSet rhs = (RowHandleSet) in.readObject(); - this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0) : rhs); + this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0) : rhs); in.close(); return; } catch (final Throwable e) { ConcurrentLog.logException(e); } } - this.cachedUrlHashs.put(type, new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0)); + this.cachedUrlHashs.put(type, new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0)); } } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 2af8c0235..6b5f10746 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -506,7 +506,6 @@ public final class Switchboard extends serverSwitch { this.index.connectCitation(wordCacheMaxCount, fileSizeMax); } catch (final IOException e) {ConcurrentLog.logException(e);} if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) { - this.index.connectUrlDb(this.useTailCache, this.exceed134217727); try {this.index.fulltext().connectLocalSolr();} catch (final IOException e) {ConcurrentLog.logException(e);} } this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false)); @@ -1347,7 +1346,6 @@ public final class Switchboard extends serverSwitch { if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) { this.index.fulltext().connectLocalSolr(); - 
this.index.connectUrlDb(this.useTailCache, this.exceed134217727); } this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false)); diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 46b8a5960..ea392e561 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -74,10 +74,6 @@ public class DocumentIndex extends Segment { ); super.connectRWI(cachesize, targetFileSize * 4 - 1); super.connectCitation(cachesize, targetFileSize * 4 - 1); - super.connectUrlDb( - false, // useTailCache - false // exceed134217727 - ); super.fulltext().connectLocalSolr(); super.fulltext().setUseWebgraph(true); this.callback = callback; diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 77afc1d87..43b4d6cc9 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -54,7 +54,6 @@ import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.cora.federate.solr.instance.InstanceMirror; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.solr.instance.ShardInstance; -import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.sorting.ScoreMap; @@ -66,10 +65,6 @@ import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReferenceVars; -import net.yacy.kelondro.index.Cache; -import net.yacy.kelondro.index.Index; -import net.yacy.kelondro.index.Row; -import net.yacy.kelondro.table.SplitTable; import net.yacy.kelondro.util.MemoryControl; import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionConfiguration; @@ -90,9 
+85,7 @@ public final class Fulltext { // class objects private final File segmentPath; private final File archivePath; - private Index urlIndexFile; private Export exportthread; // will have a export thread assigned if exporter is running - private String tablename; private ArrayList statsDump; private InstanceMirror solrInstances; private final CollectionConfiguration collectionConfiguration; @@ -103,8 +96,6 @@ public final class Fulltext { final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) { this.segmentPath = segmentPath; this.archivePath = archivePath; - this.tablename = null; - this.urlIndexFile = null; this.exportthread = null; // will have a export thread assigned if exporter is running this.statsDump = null; this.solrInstances = new InstanceMirror(); @@ -121,35 +112,6 @@ public final class Fulltext { return this.writeWebgraph; } - /** - * @deprecated - * used only for migration - * @return the connected URLDb - - */ - @Deprecated - public Index getURLDb() { - return this.urlIndexFile; - } - - protected void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) { - if (this.urlIndexFile != null) return; - this.tablename = tablename; - this.urlIndexFile = new SplitTable(new File(this.segmentPath, "default"), tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727); - // SplitTable always returns != null, even if no file exists. 
- // as old UrlDb should be null if not exist, check and close if empty - // TODO: check if a SplitTable.open() returning null or error status on not existing file is preferable - if (this.urlIndexFile.isEmpty()) { - disconnectUrlDb(); - } - } - - public void disconnectUrlDb() { - if (this.urlIndexFile == null) return; - this.urlIndexFile.close(); - this.urlIndexFile = null; - } - public CollectionConfiguration getDefaultConfiguration() { return this.collectionConfiguration; } @@ -233,19 +195,13 @@ public final class Fulltext { } public void clearCaches() { - if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache(); if (this.statsDump != null) this.statsDump.clear(); this.solrInstances.clearCaches(); this.statsDump = null; } - public void clearURLIndex() throws IOException { + public void clearURLIndex() { if (this.exportthread != null) this.exportthread.interrupt(); - if (this.urlIndexFile == null) { - SplitTable.delete(new File(this.segmentPath, "default"), this.tablename); - } else { - this.urlIndexFile.clear(); - } this.statsDump = null; this.commit(true); } @@ -280,8 +236,7 @@ public final class Fulltext { public long collectionSize() { long t = System.currentTimeMillis(); if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue; - long size = this.urlIndexFile == null ? 
0 : this.urlIndexFile.size(); - size += this.solrInstances.getDefaultMirrorConnector().getSize(); + long size = this.solrInstances.getDefaultMirrorConnector().getSize(); this.collectionSizeLastAccess = t; this.collectionSizeLastValue = size; return size; @@ -297,10 +252,6 @@ public final class Fulltext { public void close() { this.statsDump = null; - if (this.urlIndexFile != null) { - this.urlIndexFile.close(); - this.urlIndexFile = null; - } this.solrInstances.close(); } @@ -364,27 +315,12 @@ public final class Fulltext { try { SolrDocument doc = this.getDefaultConnector().getDocumentById(u); if (doc != null) { - if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); // migration return new URIMetadataNode(doc, wre, weight); } } catch (final IOException e) { ConcurrentLog.logException(e); } - // get the metadata from the old metadata index - if (this.urlIndexFile != null) try { - // slow migration to solr - final Row.Entry entry = this.urlIndexFile.remove(urlHash); - if (entry == null) return null; - URIMetadataRow row = new URIMetadataRow(entry, wre); - SolrInputDocument solrInput = this.collectionConfiguration.metadata2solr(row); - this.putDocument(solrInput); - SolrDocument sd = this.collectionConfiguration.toSolrDocument(solrInput); - return new URIMetadataNode(sd, wre, weight); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } - return null; } @@ -394,14 +330,8 @@ public final class Fulltext { String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); String url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); ConcurrentLog.info("Fulltext", "indexing: " + id + " " + url); - byte[] idb = ASCII.getBytes(id); try { - if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); - //Date sdDate = (Date) connector.getFieldById(id, CollectionSchema.last_modified.getSolrFieldName()); - //Date docDate = null; - //if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, 
CollectionSchema.last_modified)) == null || sdDate.before(docDate)) { connector.add(doc); - //} } catch (final SolrException e) { throw new IOException(e.getMessage(), e); } @@ -428,7 +358,6 @@ public final class Fulltext { byte[] idb = entry.hash(); String id = ASCII.String(idb); try { - if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); // because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten SolrDocument sd = this.getDefaultConnector().getDocumentById(id); if (sd == null || (new URIMetadataNode(sd)).isOlder(entry)) { @@ -458,24 +387,7 @@ public final class Fulltext { (freshdate == null || freshdate.after(now)) ? null : (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); - // delete in old metadata structure - if (Fulltext.this.urlIndexFile != null) { - final ArrayList l = new ArrayList(); - CloneableIterator i; - try { - i = Fulltext.this.urlIndexFile.keys(true, null); - String hash; - while (i != null && i.hasNext()) { - hash = ASCII.String(i.next()); - if (hosthashes.contains(hash.substring(6))) l.add(hash); - } - - // then delete the urls using this list - for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h)); - } catch (final IOException e) {} - } - - // finally remove the line with statistics + // remove the line with statistics if (Fulltext.this.statsDump != null) { final Iterator hsi = Fulltext.this.statsDump.iterator(); HostStat hs; @@ -578,12 +490,6 @@ public final class Fulltext { } catch (final Throwable e) { ConcurrentLog.logException(e); } - if (Fulltext.this.urlIndexFile != null) try { - for (String id: deleteIDs) { - final Row.Entry r = Fulltext.this.urlIndexFile.remove(ASCII.getBytes(id)); - if (r != null) Fulltext.this.statsDump = null; - } - } catch (final IOException e) {} } public boolean remove(final byte[] urlHash) { @@ -595,20 +501,12 @@ public final class Fulltext { } catch 
(final Throwable e) { ConcurrentLog.logException(e); } - if (this.urlIndexFile != null) try { - final Row.Entry r = this.urlIndexFile.remove(urlHash); - if (r != null) this.statsDump = null; - return r != null; - } catch (final IOException e) { - return false; - } return false; } @Deprecated public boolean exists(final String urlHash) { if (urlHash == null) return false; - if (this.urlIndexFile != null && this.urlIndexFile.has(ASCII.getBytes(urlHash))) return true; try { if (this.getDefaultConnector().existsById(urlHash)) return true; } catch (final Throwable e) { @@ -629,17 +527,6 @@ public final class Fulltext { if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e; Set idsC = new HashSet(); idsC.addAll(ids); - if (this.urlIndexFile != null) { - Iterator idsi = idsC.iterator(); - String h; - while (idsi.hasNext()) { - h = idsi.next(); - if (this.urlIndexFile.has(ASCII.getBytes(h))) { - idsi.remove(); - e.add(h); - } - } - } try { Set e1 = this.getDefaultConnector().existsByIds(idsC); e.addAll(e1); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 39512de1b..45f639a07 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -66,7 +66,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; @@ -204,10 +203,6 @@ public class Segment { public long citationSegmentCount() { return this.urlCitationIndex == null ? 
0 : this.urlCitationIndex.getSegmentCount(); } - - public void connectUrlDb(final boolean useTailCache, final boolean exceed134217727) { - this.fulltext.connectUrlDb(UrlDbName, useTailCache, exceed134217727); - } public Fulltext fulltext() { return this.fulltext; @@ -280,7 +275,7 @@ public class Segment { } private static RowHandleSet getPossibleRootHashes(DigestURL url) { - RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); + RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); String rootStub = url.getProtocol() + "://" + url.getHost(); try { rootCandidates.put(new DigestURL(rootStub).hash()); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index a84a18c6e..8e3cc2f46 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -70,7 +70,7 @@ import net.yacy.document.LargeNumberCache; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -277,7 +277,7 @@ public final class SearchEvent { this.addRunning = true; this.receivedRemoteReferences = new AtomicInteger(0); this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang)); - this.urlhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); + this.urlhashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 100); this.taggingPredicates = new HashMap(); for (Tagging t: LibraryProvider.autotagging.getVocabularies()) { this.taggingPredicates.put(t.getName(), 
t.getPredicate());