From b5fc2b63ea3b154c2985ac422cf4ede0913b0ede Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 11 Jul 2014 19:52:25 +0200 Subject: [PATCH] removed exists() retrieval functions from error cache and replaced them with metadata retrieval from connectors directly. This should cause better usage of the cache. Automatically increase the metadata cache if more memory is available. --- htroot/HostBrowser.java | 7 +++---- .../federate/solr/instance/InstanceMirror.java | 4 +++- source/net/yacy/crawler/CrawlStacker.java | 17 +++++++++++------ source/net/yacy/crawler/data/CrawlQueues.java | 5 +---- source/net/yacy/search/Switchboard.java | 12 +++++++++++- source/net/yacy/search/index/ErrorCache.java | 6 +++--- 6 files changed, 32 insertions(+), 19 deletions(-) diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 510f16b03..75075c106 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -59,7 +59,6 @@ import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.query.QueryParams; -import net.yacy.search.schema.CollectionConfiguration.FailDoc; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -466,7 +465,7 @@ public class HostBrowser { prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false"); StoreType type = (StoreType) entry.getValue(); try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;} - HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash(), true); + HarvestProcess process = uri == null ? 
null : sb.crawlQueues.exists(uri.hash()); // todo: cannot identify errors boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS); boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED; boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/"); @@ -482,8 +481,8 @@ public class HostBrowser { FailType failType = errorDocs.get(entry.getKey()); if (failType == null) { // maybe this is only in the errorURL - FailDoc faildoc = sb.crawlQueues.errorURL.get(ASCII.String(uri.hash())); - prop.putHTML("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS && faildoc != null ? faildoc.getFailReason() : "unknown error"); + //Metadata faildoc = sb.index.fulltext().getDefaultConnector().getMetadata(ASCII.String(uri.hash())); + prop.putHTML("files_list_" + c + "_type_stored_error", "unknown error"); } else { String ids = ASCII.String(uri.hash()); InfoCacheEntry ice = infoCache.get(ids); diff --git a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java index 0dcca5efd..e5448fb43 100644 --- a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java +++ b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java @@ -32,6 +32,7 @@ import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.connector.MirrorSolrConnector; import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.kelondro.util.MemoryControl; public class InstanceMirror { @@ -165,7 +166,8 @@ public class InstanceMirror { if (msc != null) return msc; EmbeddedSolrConnector esc = getEmbeddedConnector(corename); RemoteSolrConnector rsc = getRemoteConnector(corename); - msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), 
RemoteInstance.queueSizeByMemory(), 10000, Runtime.getRuntime().availableProcessors()); + int cacheSize = (int) (MemoryControl.available() / 30000); // will return about 10000 for standard ram size + msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), RemoteInstance.queueSizeByMemory(), cacheSize, Runtime.getRuntime().availableProcessors()); //msc = new MirrorSolrConnector(esc, rsc); this.mirrorConnectorCache.put(corename, msc); return msc; diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index a1f33d4da..e0e6a30b8 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -43,6 +43,7 @@ import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailCategory; +import net.yacy.cora.federate.solr.connector.SolrConnector.Metadata; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.ftp.FTPClient; @@ -60,7 +61,6 @@ import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.FilterEngine; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; -import net.yacy.search.schema.CollectionConfiguration; public final class CrawlStacker { @@ -379,22 +379,27 @@ public final class CrawlStacker { public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) { // check if the url is double registered - final HarvestProcess dbocc = this.nextQueue.exists(url.hash(), false); // returns the name of the queue if entry exists + final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists if (dbocc != null) { return "double in: " + dbocc.name(); } String urlhash = ASCII.String(url.hash()); - final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash); - final 
Date oldDate = errorEntry == null ? null : errorEntry.getFailDate(); + Metadata oldEntry = null; + try { + oldEntry = this.indexSegment.fulltext().getDefaultConnector().getMetadata(urlhash); + } catch (IOException e) { + ConcurrentLog.logException(e); + } + final Long oldDate = oldEntry == null ? null : oldEntry.date; if (oldDate == null) { return null; // no evidence that we know that url } - final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime(); + final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue(); final String urlstring = url.toString(); if (recrawl) { if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + - ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago."); + ((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago."); } else { return "double in: LURL-DB, oldDate = " + oldDate.toString(); } diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index d7f16c881..55c938502 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -143,7 +143,7 @@ public class CrawlQueues { * @param hash * @return if the hash exists, the name of the database is returned, otherwise null is returned */ - public HarvestProcess exists(final byte[] hash, final boolean checkErrorCache) { + public HarvestProcess exists(final byte[] hash) { if (this.delegatedURL.containsKey(ASCII.String(hash))) { return HarvestProcess.DELEGATED; } @@ -155,9 +155,6 @@ public class CrawlQueues { return HarvestProcess.WORKER; } } - if (checkErrorCache && this.errorURL.exists(hash)) { - return HarvestProcess.ERRORS; - } return null; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 0e2daa6f3..8a5c122f4 100644 --- a/source/net/yacy/search/Switchboard.java +++ 
b/source/net/yacy/search/Switchboard.java @@ -98,6 +98,7 @@ import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.SchemaConfiguration; +import net.yacy.cora.federate.solr.connector.SolrConnector.Metadata; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; @@ -1616,7 +1617,16 @@ public final class Switchboard extends serverSwitch { */ public HarvestProcess urlExists(final String hash) { if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED; - return this.crawlQueues.exists(ASCII.getBytes(hash), true); + HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash)); + if (hp != null) return hp; + try { + Metadata md = this.index.fulltext().getDefaultConnector().getMetadata(hash); + if (md == null) return null; + return HarvestProcess.LOADED; // todo: can also be in error + } catch (IOException e) { + ConcurrentLog.logException(e); + return null; + } } public void urlRemove(final Segment segment, final byte[] hash) { diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 29a21cdae..85235123a 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -167,7 +167,8 @@ public class ErrorCache { } return l; } - + + /* public CollectionConfiguration.FailDoc get(final String urlhash) { CollectionConfiguration.FailDoc failDoc = null; synchronized (this.cache) { @@ -185,7 +186,6 @@ public class ErrorCache { return null; } } - public boolean exists(final byte[] urlHash) { String urlHashString = ASCII.String(urlHash); try { @@ -200,7 +200,7 @@ public class ErrorCache { return false; } } - +*/ public void clearStack() { synchronized (this.cache) { this.cache.clear();