diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 33b63489a..0852f0382 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -45,7 +45,6 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -110,8 +109,8 @@ public class IndexControlRWIs_p { final String[] urls = post.getAll("urlhx.*"); HandleSet urlb = new RowHandleSet( - URIMetadataRow.rowdef.primaryKeyLength, - URIMetadataRow.rowdef.objectOrder, + Word.commonHashLength, + Word.commonHashOrder, urls.length); if ( urls != null ) { for ( final String s : urls ) { @@ -165,8 +164,8 @@ public class IndexControlRWIs_p { final Iterator en = index.entries(); urlb = new RowHandleSet( - URIMetadataRow.rowdef.primaryKeyLength, - URIMetadataRow.rowdef.objectOrder, + Word.commonHashLength, + Word.commonHashOrder, index.size()); while ( en.hasNext() ) { try { @@ -208,8 +207,8 @@ public class IndexControlRWIs_p { } final HandleSet urlHashes = new RowHandleSet( - URIMetadataRow.rowdef.primaryKeyLength, - URIMetadataRow.rowdef.objectOrder, + Word.commonHashLength, + Word.commonHashOrder, 0); for ( final byte[] b : urlb ) { try { @@ -363,8 +362,8 @@ public class IndexControlRWIs_p { final String blacklist = post.get("blacklist", ""); final HandleSet urlHashes = new RowHandleSet( - URIMetadataRow.rowdef.primaryKeyLength, - URIMetadataRow.rowdef.objectOrder, + Word.commonHashLength, + Word.commonHashOrder, urlb.size()); if ( post.containsKey("blacklisturls") ) { final String[] supportedBlacklistTypes = diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 6d8e1237c..a57cc79fd 100644 --- a/htroot/IndexControlURLs_p.java +++ 
b/htroot/IndexControlURLs_p.java @@ -149,7 +149,7 @@ public class IndexControlURLs_p { // delete everything if ( post.containsKey("deletecomplete") ) { if ( post.get("deleteIndex", "").equals("on") ) { - try {segment.fulltext().clearURLIndex();} catch (final IOException e) {} + segment.fulltext().clearURLIndex(); try {segment.fulltext().clearLocalSolr();} catch (final IOException e) {} } if ( post.get("deleteRemoteSolr", "").equals("on")) { diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 4947b4c5c..d66077310 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -87,11 +87,9 @@ public class IndexFederated_p { if (previous_core_fulltext && !post_core_fulltext) { // switch off sb.index.fulltext().disconnectLocalSolr(); - sb.index.fulltext().disconnectUrlDb(); } if (!previous_core_fulltext && post_core_fulltext) { // switch on - sb.index.connectUrlDb(sb.useTailCache, sb.exceed134217727); try { sb.index.fulltext().connectLocalSolr(); } catch (final IOException e) { ConcurrentLog.logException(e); } } diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 8d982eff1..edc8821e5 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -36,7 +36,6 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.id.MultiProtocolURL; -import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; diff --git a/htroot/migrateurldb_p.html b/htroot/migrateurldb_p.html deleted file mode 100644 index 7da6e2157..000000000 --- a/htroot/migrateurldb_p.html +++ /dev/null @@ -1,36 +0,0 @@ - - - - Migrate URLdb - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuIndexControl.template%# - -

Migrate URLdb to embedded Solr Index

- -

Convert old meta data (urldb) index to embedded Solr fulltext index.

- -
-
-

A low-priority background job has been started which reads each entry from the old index, adds it to Solr, and deletes it from the old index.

-

The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).
- If you feel that entries which have not been accessed are still relevant, this migration will transfer all entries from the old urldb index.

-

You may refresh this page to see how many entries in the old index are left for migration

-

Hint: this background task runs until all entries are migrated or YaCy is shut down. The migration is not automatically restarted.

-
-
-
-
- - - -

#[count]# entries in old index left to migrate.

-

For large indexes this may run for a long time (migration speed: #[speed]# entries per minute)

-
-
- - #%env/templates/footer.template%# - - diff --git a/htroot/migrateurldb_p.java b/htroot/migrateurldb_p.java deleted file mode 100644 index 3a977aa0d..000000000 --- a/htroot/migrateurldb_p.java +++ /dev/null @@ -1,44 +0,0 @@ -// migrateurldb_p.java - -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.migration; -import net.yacy.search.Switchboard; -import net.yacy.server.serverObjects; -import net.yacy.server.serverSwitch; - -public class migrateurldb_p { - - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { - final serverObjects prop = new serverObjects(); - final Switchboard sb = (Switchboard) env; - - int cnt; - - if ((cnt = migration.migrateUrldbtoSolr(sb)) > 0) { - prop.put("count", cnt); - - if (post != null && post.containsKey("dorefresh")) { - int lastcount = post.getInt("lastcount", 0); - Long t = post.getLong("lasttime", 1); - - Double difft = (System.currentTimeMillis() - t) / 60000.0d; - int diff = (int)((lastcount - cnt) / difft) ; - prop.put("speed", diff); - prop.put("lasttime", t); - prop.put("lastcount", lastcount); - - } else { - prop.put("speed", "?"); - prop.put("lastcount",cnt); - prop.put("lasttime", System.currentTimeMillis()); - } - } else { - prop.put("speed", ""); - prop.put("count", "no urldb index available"); - } - - - // return rewrite properties - return prop; - } -} \ No newline at end of file diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index e2a8d16fe..2367105ae 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -42,7 +42,7 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.Memory; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import 
net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; @@ -182,8 +182,8 @@ public final class transferRWI { String wordHash; byte[] urlHash; WordReferenceRow iEntry; - final HandleSet unknownURL = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); - final HandleSet knownURL = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + final HandleSet unknownURL = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); + final HandleSet knownURL = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); final ArrayList wordhashes = new ArrayList(); int received = 0; int blocked = 0; diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index 41e503687..7d4ac41ce 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -57,7 +57,6 @@ import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.ClientProtocolException; import org.apache.http.client.CredentialsProvider; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index a3ae93a02..0e4da872f 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -51,7 +51,7 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.robots.RobotsTxt; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.BufferedObjectIndex; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.RowHandleSet; @@ 
-96,7 +96,7 @@ public class Balancer { this.cacheStacksPath = cachePath; this.domainStacks = new ConcurrentHashMap(); this.domStackInitSize = Integer.MAX_VALUE; - this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + this.double_push_check = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); this.zeroWaitingCandidates = new ArrayList>(); this.random = new Random(System.currentTimeMillis()); @@ -564,7 +564,7 @@ public class Balancer { if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return; this.domainStacks.clear(); this.lastDomainStackFill = System.currentTimeMillis(); - final HandleSet blackhandles = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); + final HandleSet blackhandles = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); String host; Request request; int count = 0; diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index f60c04089..8d805b16d 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -49,7 +49,6 @@ import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.blob.MapHeap; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; @@ -588,7 +587,7 @@ public final class CrawlSwitchboard { r = sei.next(); String handle = r.profileHandle(); RowHandleSet us = this.profilesActiveCrawlsCounter.get(handle); - if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);} + if (us == null) {us = new 
RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);} if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many deletionCandidate.remove(handle); if (deletionCandidate.size() == 0) return new HashSet(0); diff --git a/source/net/yacy/crawler/HostQueue.java b/source/net/yacy/crawler/HostQueue.java index c78ac989d..9d7c51df3 100644 --- a/source/net/yacy/crawler/HostQueue.java +++ b/source/net/yacy/crawler/HostQueue.java @@ -32,7 +32,7 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.robots.RobotsTxt; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.BufferedObjectIndex; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.RowHandleSet; @@ -60,7 +60,7 @@ public class HostQueue { final boolean exceed134217727) { this.hostHash = hostHash; this.queuesPath = queuesPath; - this.urlHashDoubleCheck = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + this.urlHashDoubleCheck = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); // create a stack for newly entered entries if (!(this.queuesPath.exists())) this.queuesPath.mkdir(); // make the path diff --git a/source/net/yacy/data/BookmarksDB.java b/source/net/yacy/data/BookmarksDB.java index d9c0140a6..049513af3 100644 --- a/source/net/yacy/data/BookmarksDB.java +++ b/source/net/yacy/data/BookmarksDB.java @@ -43,7 +43,7 @@ import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.blob.MapHeap; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; public class BookmarksDB { @@ -212,7 +212,7 @@ public class 
BookmarksDB { final TreeSet set=new TreeSet(new bookmarkComparator(true)); final String tagHash=BookmarkHelper.tagHash(tagName); final Tag tag=getTag(tagHash); - RowHandleSet hashes = tag == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes(); + RowHandleSet hashes = tag == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10) : tag.getUrlHashes(); if (priv) { for (byte[] hash: hashes) set.add(ASCII.String(hash)); } else { @@ -389,7 +389,7 @@ public class BookmarksDB { private Tag(final String name) { this.tagHash = BookmarkHelper.tagHash(name); this.tagName = name; - this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); + this.urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); } /** diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index 2bf071661..7c4ad5eb9 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -11,16 +11,16 @@ // LICENSE // // This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by +// it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. +// GNU General Public License for more details. 
// // You should have received a copy of the GNU General Public License +// You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -31,7 +31,6 @@ import java.text.ParseException; import java.util.Date; import java.util.List; import java.util.Properties; -import java.util.regex.Pattern; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.encoding.ASCII; @@ -43,7 +42,6 @@ import net.yacy.cora.order.Digest; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -59,7 +57,7 @@ public class URIMetadataRow { // this object stores attributes for URL entries - public static final Row rowdef = new Row( + private static final Row rowdef = new Row( "String hash-12, " + // the url's hash "String comp-360, " + // components: the url, description, author, tags and publisher "Cardinal mod-4 {b256}, " + // last-modified from the httpd @@ -108,7 +106,7 @@ public class URIMetadataRow { private WordReference word; // this is only used if the url is transported via remote search requests private Components comp; - public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { + private URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { this.entry = entry; this.snippet = ""; this.word = searchedWord; @@ -242,17 +240,6 @@ public class URIMetadataRow { return h; } - private String hostHash = null; - public String hosthash() { - if (this.hostHash != null) return this.hostHash; - this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6); - return this.hostHash; - } - - public boolean matches(final 
Pattern matcher) { - return this.metadata().matches(matcher); - } - public DigestURL url() { return this.metadata().url(); } @@ -281,7 +268,7 @@ public class URIMetadataRow { return this.metadata().lon(); } - private Components metadata() { + public Components metadata() { // avoid double computation of metadata elements if (this.comp != null) return this.comp; // parse elements from comp field; @@ -434,20 +421,6 @@ public class URIMetadataRow { } } - public Request toBalancerEntry(final String initiatorHash) { - return new Request( - ASCII.getBytes(initiatorHash), - metadata().url(), - referrerHash(), - metadata().dc_title(), - moddate(), - null, - 0, - 0, - 0, - 0); - } - /** * @return the object as String.
* This e.g. looks like this: @@ -472,7 +445,7 @@ public class URIMetadataRow { private final String dc_title, dc_creator, dc_subject, dc_publisher; private String latlon; // a comma-separated tuple as "," where the coordinates are given as WGS84 spatial coordinates in decimal degrees - public Components( + private Components( final String urlRaw, final byte[] urlhash, final String title, @@ -489,12 +462,7 @@ public class URIMetadataRow { this.dc_publisher = publisher; this.latlon = latlon; } - public boolean matches(final Pattern matcher) { - if (this.urlRaw != null) return matcher.matcher(this.urlRaw.toLowerCase()).matches(); - if (this.url != null) return matcher.matcher(this.url.toNormalform(true).toLowerCase()).matches(); - return false; - } - public DigestURL url() { + private DigestURL url() { if (this.url == null) { try { this.url = new DigestURL(this.urlRaw, this.urlHash); @@ -506,11 +474,11 @@ public class URIMetadataRow { } return this.url; } - public String dc_title() { return this.dc_title; } - public String dc_creator() { return this.dc_creator; } - public String dc_publisher() { return this.dc_publisher; } - public String dc_subject() { return this.dc_subject; } - public double lat() { + private String dc_title() { return this.dc_title; } + private String dc_creator() { return this.dc_creator; } + private String dc_publisher() { return this.dc_publisher; } + private String dc_subject() { return this.dc_subject; } + private double lat() { if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; final int p = this.latlon.indexOf(','); if (p < 0) return 0.0d; @@ -523,7 +491,7 @@ public class URIMetadataRow { return 0.0d; } } - public double lon() { + private double lon() { if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; final int p = this.latlon.indexOf(','); if (p < 0) return 0.0d; diff --git a/source/net/yacy/kelondro/data/word/Word.java b/source/net/yacy/kelondro/data/word/Word.java index ca93fa7ca..6c207cfff 100644 --- 
a/source/net/yacy/kelondro/data/word/Word.java +++ b/source/net/yacy/kelondro/data/word/Word.java @@ -40,10 +40,8 @@ import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.MemoryControl; - public class Word { - /** * this is the length (12) of the hash key that is used:
* - for seed hashes (this Object)
@@ -51,7 +49,8 @@ public class Word { * - for L-URL hashes (plasmaLURL.urlHashLength)

* these hashes all shall be generated by base64.enhancedCoder */ - public static final int commonHashLength = 12; + public static final int commonHashLength = 12; + public static final Base64Order commonHashOrder = Base64Order.enhancedCoder; private static final int hashCacheSize = Math.max(20000, Math.min(200000, (int) (MemoryControl.available() / 40000L))); private static ARC hashCache = null; @@ -64,12 +63,6 @@ public class Word { ConcurrentLog.info("Word", "hashCache.size = " + 1000); } } - /* - private static ConcurrentHashMap hashCache = null; - static { - hashCache = new ConcurrentHashMap(); - } - */ // object carries statistics for words and sentences public int count; // number of occurrences @@ -122,7 +115,7 @@ public class Word { byte[] h = hashCache.get(wordlc); if (h != null) return h; // calculate the hash - h = Base64Order.enhancedCoder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength); + h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength); while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) { // ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer // statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never) diff --git a/source/net/yacy/kelondro/rwi/IndexCell.java b/source/net/yacy/kelondro/rwi/IndexCell.java index 7549a98ac..f3ea810b2 100644 --- a/source/net/yacy/kelondro/rwi/IndexCell.java +++ b/source/net/yacy/kelondro/rwi/IndexCell.java @@ -40,7 +40,7 @@ import net.yacy.cora.storage.ComparableARC; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.MemoryControl; import 
net.yacy.kelondro.util.MergeIterator; @@ -102,7 +102,7 @@ public final class IndexCell extends AbstractBu this.targetFileSize = targetFileSize; this.maxFileSize = maxFileSize; this.writeBufferSize = writeBufferSize; - this.removeDelayedURLs = new TreeMap(URIMetadataRow.rowdef.objectOrder); + this.removeDelayedURLs = new TreeMap(Word.commonHashOrder); this.flushShallRun = true; this.flushThread = new FlushThread(); this.flushThread.start(); @@ -399,7 +399,7 @@ public final class IndexCell extends AbstractBu r = this.removeDelayedURLs.get(termHash); } if (r == null) { - r = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + r = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); } try { r.put(urlHashBytes); @@ -414,7 +414,7 @@ public final class IndexCell extends AbstractBu @Override public void removeDelayed() throws IOException { - final HandleSet words = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed. + final HandleSet words = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed. 
synchronized (this.removeDelayedURLs) { for (final byte[] b: this.removeDelayedURLs.keySet()) try {words.put(b);} catch (final SpaceExceededException e) {} } @@ -476,7 +476,7 @@ public final class IndexCell extends AbstractBu } public RemoveReducer(final byte[] urlHashBytes) { - this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + this.urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); try { this.urlHashes.put(urlHashBytes); } catch (final SpaceExceededException e) { diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java index 4ea302dca..e19da7ce6 100644 --- a/source/net/yacy/migration.java +++ b/source/net/yacy/migration.java @@ -43,11 +43,7 @@ import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.storage.Configuration.Entry; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.LibraryProvider; -import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.index.Index; -import net.yacy.kelondro.index.Row; import net.yacy.kelondro.workflow.BusyThread; -import net.yacy.search.index.Fulltext; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; import org.apache.solr.client.solrj.SolrServerException; @@ -282,83 +278,6 @@ public class migration { sb.setConfig("crawler.http.acceptCharset", sb.getConfig("crawler.acceptCharset","ISO-8859-1,utf-8;q=0.7,*;q=0.7")); } } - /** - * converts old urldb to Solr. - * In chunks of 1000 entries. 
- * Creates a lock file in workdir to allow only one active migration thread - * @return current size of urldb index - */ - @SuppressWarnings("deprecation") - public static int migrateUrldbtoSolr(final Switchboard sb) { - int ret = 0; - final File f; - final Fulltext ft = sb.index.fulltext(); - - if (ft.getURLDb() != null) { - ret = ft.getURLDb().size(); - f = new File(sb.workPath, "migrateUrldbtoSolr.lck"); - f.deleteOnExit(); - if (f.exists()) { - return ret; - } - try { - f.createNewFile(); - } catch (final IOException ex) { - ConcurrentLog.info("migrateUrldbtoSolr","could not create lock file"); - } - - final Thread t = new Thread() { - boolean go = true; - final Index urldb = ft.getURLDb(); - - public void run() { - try { - Thread.currentThread().setName("migration.migrateUrldbtoSolr"); - - int i = urldb.size(); - while (go && i > 0) { - - List chunk = urldb.random(1000); - if ((chunk == null) || (chunk.size() == 0)) { - go = false; - break; - } - Iterator chunkit = chunk.iterator(); - - while (go && chunkit.hasNext()) { - try { // to catch any data errors - URIMetadataRow row = new URIMetadataRow(chunkit.next(), null); - ft.putMetadata(row); // this deletes old urldb-entry first and inserts into Solr - i--; - if (Switchboard.getSwitchboard().shallTerminate()) { - go = false; - } - } catch (final Exception e) { - ConcurrentLog.info("migrateUrldbtoSolr", "some error while adding old data to new index, continue with next entry"); - } - } - ConcurrentLog.info("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)"); - } - ft.commit(true); - - } catch (final IOException ex) { - ConcurrentLog.info("migrateUrldbtoSolr", "error reading old urldb index"); - } finally { - if (f.exists()) { - f.delete(); // delete lock file - } - } - } - - public void exit() { - go = false; - } - }; - t.setPriority(Thread.MIN_PRIORITY); - t.start(); - } - return ret; - } /** * Reindex embedded solr index diff --git 
a/source/net/yacy/peers/Dispatcher.java b/source/net/yacy/peers/Dispatcher.java index 92da26e4f..a5ee4a1e0 100644 --- a/source/net/yacy/peers/Dispatcher.java +++ b/source/net/yacy/peers/Dispatcher.java @@ -39,7 +39,6 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.Memory; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.index.RowHandleSet; @@ -194,7 +193,7 @@ public class Dispatcher { final ArrayList> rc; if (ram) { // selection was only from ram, so we have to carefully remove only the selected entries - final HandleSet urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); Iterator it; for (final ReferenceContainer c: containers) { urlHashes.clear(); diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index d68999c4a..d54a3dd3a 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -52,6 +52,7 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.SetTools; @@ -462,7 +463,7 @@ public class Blacklist { } HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType); if (urlHashCache == null) { - urlHashCache = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + urlHashCache = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0); if 
(isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) { try { urlHashCache.put(url.hash()); @@ -679,13 +680,13 @@ public class Blacklist { try { ObjectInputStream in = new ObjectInputStream(new FileInputStream(cachefile)); RowHandleSet rhs = (RowHandleSet) in.readObject(); - this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0) : rhs); + this.cachedUrlHashs.put(type, rhs == null ? new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0) : rhs); in.close(); return; } catch (final Throwable e) { ConcurrentLog.logException(e); } } - this.cachedUrlHashs.put(type, new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0)); + this.cachedUrlHashs.put(type, new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0)); } } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 2af8c0235..6b5f10746 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -506,7 +506,6 @@ public final class Switchboard extends serverSwitch { this.index.connectCitation(wordCacheMaxCount, fileSizeMax); } catch (final IOException e) {ConcurrentLog.logException(e);} if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) { - this.index.connectUrlDb(this.useTailCache, this.exceed134217727); try {this.index.fulltext().connectLocalSolr();} catch (final IOException e) {ConcurrentLog.logException(e);} } this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false)); @@ -1347,7 +1346,6 @@ public final class Switchboard extends serverSwitch { if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) { this.index.fulltext().connectLocalSolr(); - 
this.index.connectUrlDb(this.useTailCache, this.exceed134217727); } this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false)); diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 46b8a5960..ea392e561 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -74,10 +74,6 @@ public class DocumentIndex extends Segment { ); super.connectRWI(cachesize, targetFileSize * 4 - 1); super.connectCitation(cachesize, targetFileSize * 4 - 1); - super.connectUrlDb( - false, // useTailCache - false // exceed134217727 - ); super.fulltext().connectLocalSolr(); super.fulltext().setUseWebgraph(true); this.callback = callback; diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 77afc1d87..43b4d6cc9 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -54,7 +54,6 @@ import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.cora.federate.solr.instance.InstanceMirror; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.solr.instance.ShardInstance; -import net.yacy.cora.order.CloneableIterator; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.sorting.ScoreMap; @@ -66,10 +65,6 @@ import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReferenceVars; -import net.yacy.kelondro.index.Cache; -import net.yacy.kelondro.index.Index; -import net.yacy.kelondro.index.Row; -import net.yacy.kelondro.table.SplitTable; import net.yacy.kelondro.util.MemoryControl; import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionConfiguration; @@ -90,9 
+85,7 @@ public final class Fulltext { // class objects private final File segmentPath; private final File archivePath; - private Index urlIndexFile; private Export exportthread; // will have a export thread assigned if exporter is running - private String tablename; private ArrayList statsDump; private InstanceMirror solrInstances; private final CollectionConfiguration collectionConfiguration; @@ -103,8 +96,6 @@ public final class Fulltext { final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) { this.segmentPath = segmentPath; this.archivePath = archivePath; - this.tablename = null; - this.urlIndexFile = null; this.exportthread = null; // will have a export thread assigned if exporter is running this.statsDump = null; this.solrInstances = new InstanceMirror(); @@ -121,35 +112,6 @@ public final class Fulltext { return this.writeWebgraph; } - /** - * @deprecated - * used only for migration - * @return the connected URLDb - - */ - @Deprecated - public Index getURLDb() { - return this.urlIndexFile; - } - - protected void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) { - if (this.urlIndexFile != null) return; - this.tablename = tablename; - this.urlIndexFile = new SplitTable(new File(this.segmentPath, "default"), tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727); - // SplitTable always returns != null, even if no file exists. 
- // as old UrlDb should be null if not exist, check and close if empty - // TODO: check if a SplitTable.open() returning null or error status on not existing file is preferable - if (this.urlIndexFile.isEmpty()) { - disconnectUrlDb(); - } - } - - public void disconnectUrlDb() { - if (this.urlIndexFile == null) return; - this.urlIndexFile.close(); - this.urlIndexFile = null; - } - public CollectionConfiguration getDefaultConfiguration() { return this.collectionConfiguration; } @@ -233,19 +195,13 @@ public final class Fulltext { } public void clearCaches() { - if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache(); if (this.statsDump != null) this.statsDump.clear(); this.solrInstances.clearCaches(); this.statsDump = null; } - public void clearURLIndex() throws IOException { + public void clearURLIndex() { if (this.exportthread != null) this.exportthread.interrupt(); - if (this.urlIndexFile == null) { - SplitTable.delete(new File(this.segmentPath, "default"), this.tablename); - } else { - this.urlIndexFile.clear(); - } this.statsDump = null; this.commit(true); } @@ -280,8 +236,7 @@ public final class Fulltext { public long collectionSize() { long t = System.currentTimeMillis(); if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue; - long size = this.urlIndexFile == null ? 
0 : this.urlIndexFile.size(); - size += this.solrInstances.getDefaultMirrorConnector().getSize(); + long size = this.solrInstances.getDefaultMirrorConnector().getSize(); this.collectionSizeLastAccess = t; this.collectionSizeLastValue = size; return size; @@ -297,10 +252,6 @@ public final class Fulltext { public void close() { this.statsDump = null; - if (this.urlIndexFile != null) { - this.urlIndexFile.close(); - this.urlIndexFile = null; - } this.solrInstances.close(); } @@ -364,27 +315,12 @@ public final class Fulltext { try { SolrDocument doc = this.getDefaultConnector().getDocumentById(u); if (doc != null) { - if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); // migration return new URIMetadataNode(doc, wre, weight); } } catch (final IOException e) { ConcurrentLog.logException(e); } - // get the metadata from the old metadata index - if (this.urlIndexFile != null) try { - // slow migration to solr - final Row.Entry entry = this.urlIndexFile.remove(urlHash); - if (entry == null) return null; - URIMetadataRow row = new URIMetadataRow(entry, wre); - SolrInputDocument solrInput = this.collectionConfiguration.metadata2solr(row); - this.putDocument(solrInput); - SolrDocument sd = this.collectionConfiguration.toSolrDocument(solrInput); - return new URIMetadataNode(sd, wre, weight); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } - return null; } @@ -394,14 +330,8 @@ public final class Fulltext { String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); String url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); ConcurrentLog.info("Fulltext", "indexing: " + id + " " + url); - byte[] idb = ASCII.getBytes(id); try { - if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); - //Date sdDate = (Date) connector.getFieldById(id, CollectionSchema.last_modified.getSolrFieldName()); - //Date docDate = null; - //if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, 
CollectionSchema.last_modified)) == null || sdDate.before(docDate)) { connector.add(doc); - //} } catch (final SolrException e) { throw new IOException(e.getMessage(), e); } @@ -428,7 +358,6 @@ public final class Fulltext { byte[] idb = entry.hash(); String id = ASCII.String(idb); try { - if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); // because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten SolrDocument sd = this.getDefaultConnector().getDocumentById(id); if (sd == null || (new URIMetadataNode(sd)).isOlder(entry)) { @@ -458,24 +387,7 @@ public final class Fulltext { (freshdate == null || freshdate.after(now)) ? null : (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); - // delete in old metadata structure - if (Fulltext.this.urlIndexFile != null) { - final ArrayList l = new ArrayList(); - CloneableIterator i; - try { - i = Fulltext.this.urlIndexFile.keys(true, null); - String hash; - while (i != null && i.hasNext()) { - hash = ASCII.String(i.next()); - if (hosthashes.contains(hash.substring(6))) l.add(hash); - } - - // then delete the urls using this list - for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h)); - } catch (final IOException e) {} - } - - // finally remove the line with statistics + // remove the line with statistics if (Fulltext.this.statsDump != null) { final Iterator hsi = Fulltext.this.statsDump.iterator(); HostStat hs; @@ -578,12 +490,6 @@ public final class Fulltext { } catch (final Throwable e) { ConcurrentLog.logException(e); } - if (Fulltext.this.urlIndexFile != null) try { - for (String id: deleteIDs) { - final Row.Entry r = Fulltext.this.urlIndexFile.remove(ASCII.getBytes(id)); - if (r != null) Fulltext.this.statsDump = null; - } - } catch (final IOException e) {} } public boolean remove(final byte[] urlHash) { @@ -595,20 +501,12 @@ public final class Fulltext { } catch 
(final Throwable e) { ConcurrentLog.logException(e); } - if (this.urlIndexFile != null) try { - final Row.Entry r = this.urlIndexFile.remove(urlHash); - if (r != null) this.statsDump = null; - return r != null; - } catch (final IOException e) { - return false; - } return false; } @Deprecated public boolean exists(final String urlHash) { if (urlHash == null) return false; - if (this.urlIndexFile != null && this.urlIndexFile.has(ASCII.getBytes(urlHash))) return true; try { if (this.getDefaultConnector().existsById(urlHash)) return true; } catch (final Throwable e) { @@ -629,17 +527,6 @@ public final class Fulltext { if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e; Set idsC = new HashSet(); idsC.addAll(ids); - if (this.urlIndexFile != null) { - Iterator idsi = idsC.iterator(); - String h; - while (idsi.hasNext()) { - h = idsi.next(); - if (this.urlIndexFile.has(ASCII.getBytes(h))) { - idsi.remove(); - e.add(h); - } - } - } try { Set e1 = this.getDefaultConnector().existsByIds(idsC); e.addAll(e1); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 39512de1b..45f639a07 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -66,7 +66,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; @@ -204,10 +203,6 @@ public class Segment { public long citationSegmentCount() { return this.urlCitationIndex == null ? 
0 : this.urlCitationIndex.getSegmentCount(); } - - public void connectUrlDb(final boolean useTailCache, final boolean exceed134217727) { - this.fulltext.connectUrlDb(UrlDbName, useTailCache, exceed134217727); - } public Fulltext fulltext() { return this.fulltext; @@ -280,7 +275,7 @@ public class Segment { } private static RowHandleSet getPossibleRootHashes(DigestURL url) { - RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); + RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); String rootStub = url.getProtocol() + "://" + url.getHost(); try { rootCandidates.put(new DigestURL(rootStub).hash()); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index a84a18c6e..8e3cc2f46 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -70,7 +70,7 @@ import net.yacy.document.LargeNumberCache; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -277,7 +277,7 @@ public final class SearchEvent { this.addRunning = true; this.receivedRemoteReferences = new AtomicInteger(0); this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang)); - this.urlhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); + this.urlhashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 100); this.taggingPredicates = new HashMap(); for (Tagging t: LibraryProvider.autotagging.getVocabularies()) { this.taggingPredicates.put(t.getName(), 
t.getPredicate());