Replaces getLoadTime() with exists(), which uses a simpler query

Since solr-8.8.1, getLoadTime() causes high CPU usage.
4 years ago · 26223dc25a
parent 8e4d014c06
commit 26223dc25a
13 changed files with 105 additions and 101 deletions
--- a/htroot/IndexBrowser_p.java
+++ b/htroot/IndexBrowser_p.java
@ -136,8 +136,6 @@ public class IndexBrowser_p {
        prop.put("files", 0);
        prop.put("hostanalysis", 0);

-        String referer = header.get("Referer", "");
-
        String path = post == null ? "" : post.get("path", "").trim();
        sb.index.fulltext().commit(true);
        if (post == null || env == null) {
@ -161,16 +159,11 @@ public class IndexBrowser_p {

        String load = post.get("load", "");
        boolean wait = false;
-        try {
-            if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
+        if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(ASCII.String(pathURI.hash()))) {
            // in case that the url does not exist and loading is wanted turn this request into a loading request
            load = path;
            wait = true;
        }
-        } catch (IOException e1) {
-            load = path;
-            wait = true;
-        }
        if (load.length() > 0 && loadRight) {
            // stack URL
            DigestURL url;
@ -185,13 +178,8 @@ public class IndexBrowser_p {
                        sb.crawler.defaultProxyProfile.timezoneOffset()
                    ));
                prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
-                if (wait) waitloop: for (int i = 0; i < 30; i++) {
-                    try {
-                        if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
-                    } catch (IOException e1) {
-                        e1.printStackTrace();
-                        break waitloop;
-                    }
+                if (wait) for (int i = 0; i < 30; i++) {
+                    if (sb.index.exists(ASCII.String(url.hash()))) break;
                    try {Thread.sleep(100);} catch (final InterruptedException e) {}
                }
            } catch (final MalformedURLException e) {
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@ -304,20 +304,20 @@ public class IndexControlRWIs_p {
                        Reference iEntry;
                        while (urlIter.hasNext()) {
                            iEntry = urlIter.next();
-                            long loadTime = segment.fulltext().getLoadTime(ASCII.String(iEntry.urlhash()));
-                            if (loadTime < 0) {
+                            boolean exists = segment.fulltext().exists(ASCII.String(iEntry.urlhash()));
+                            if (exists) {
                                try {
-                                    unknownURLEntries.put(iEntry.urlhash());
+                                    knownURLs.put(iEntry.urlhash());
                                } catch (final SpaceExceededException e) {
                                    ConcurrentLog.logException(e);
                                }
-                                urlIter.remove();
                            } else {
                                try {
-									knownURLs.put(iEntry.urlhash());
+                                    unknownURLEntries.put(iEntry.urlhash());
                                } catch (final SpaceExceededException e) {
                                    ConcurrentLog.logException(e);
                                }
+                                urlIter.remove();
                            }
                        }

--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@ -27,7 +27,6 @@
 // javac -classpath .:../classes transferRWI.java


-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
@ -234,12 +233,7 @@ public final class transferRWI {
            }
            for (String id: testids) {
                try {
-                    try {
-                        if (sb.index.fulltext().getLoadTime(id) < 0) {
-                            unknownURL.put(ASCII.getBytes(id));
-                        }
-                    } catch (IOException e) {
-                        ConcurrentLog.logException(e);
+                    if (!sb.index.fulltext().exists(id)) {
                        unknownURL.put(ASCII.getBytes(id));
                    }
                } catch (final SpaceExceededException e) {
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@ -146,14 +146,9 @@ public final class transferURL {
            
            doublecheck = 0;
            for (String id : lEm.keySet()) {
-                long lt = -1;
-                try {
-                    lt = sb.index.getLoadTime(id);
-                } catch (IOException e1) {
-                    lt = -1;
-                    ConcurrentLog.logException(e1);
-                }
-                if (lt < 0) {
+                if (sb.index.exists(id)) {
+                    doublecheck++;
+                } else {
                    lEntry = lEm.get(id);

                    // write entry to database
@ -166,8 +161,6 @@ public final class transferURL {
                    } catch (final IOException e) {
                        ConcurrentLog.logException(e);
                    }
-                } else {
-                    doublecheck++;
                }
            }

--- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java
@ -465,8 +465,10 @@ public abstract class AbstractSolrConnector implements SolrConnector {
        params.setFacet(false);
        if (fields != null && fields.length > 0) params.setFields(fields);
        params.setIncludeScore(false);
+        if (count > 0) {
            params.setParam("defType", "edismax");
            params.setParam(DisMaxParams.QF, CollectionSchema.text_t.getSolrFieldName() + "^1.0");
+        }
        return params;
    }
    
@ -498,6 +500,22 @@ public abstract class AbstractSolrConnector implements SolrConnector {
        return md;
    }
    
+    /**
+     * Test whether the index holds a document whose id is the given url hash.
+     * @param id the url hash, used as the document id
+     * @return true if the count query matches at least one document; false if not or if the query failed
+     */
+    @Override
+    public boolean exists(final String id) {
+        final String idField = CollectionSchema.id.getSolrFieldName();
+        try {
+            return getCountByQuery("{!cache=false raw f=" + idField + "}" + id) > 0L;
+        } catch (final IOException ioe) {
+            ConcurrentLog.logException(ioe);
+            return false;
+        }
+    }
+    
    /**
     * get the number of results when this query is done.
     * This should only be called if the actual result is never used, and only the count is interesting
--- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java
@ -30,11 +30,6 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;

-import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
-import net.yacy.cora.federate.solr.instance.SolrInstance;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.search.schema.CollectionSchema;
-
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexableField;
@ -57,7 +52,6 @@ import org.apache.solr.core.SolrCore;
 import org.apache.solr.handler.component.SearchHandler;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrQueryRequestBase;
-import org.apache.solr.request.SolrRequestInfo;
 import org.apache.solr.response.ResultContext;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.FieldType;
@ -71,6 +65,11 @@ import org.apache.solr.search.SolrCache;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.util.RefCounted;

+import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
+import net.yacy.cora.federate.solr.instance.SolrInstance;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.search.schema.CollectionSchema;
+
 public class EmbeddedSolrConnector extends SolrServerConnector implements SolrConnector {
    
    public static final String SELECT = "/select";
@ -407,6 +406,22 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
        return numFound;
    }
    
+    /**
+     * Test whether the embedded index holds a document whose id is the given url hash.
+     * @param id the url hash, used as the document id
+     * @return true if the search reports at least one match; false if not or if anything was thrown
+     */
+    @Override
+    public boolean exists(final String id) {
+        final String idField = CollectionSchema.id.getSolrFieldName();
+        try (DocListSearcher searcher = new DocListSearcher("{!cache=false raw f=" + idField + "}" + id, null, 0, 0, idField)) {
+            return searcher.response.matches() > 0L;
+        } catch (final Throwable t) {
+            ConcurrentLog.logException(t);
+            return false;
+        }
+    }
+
    /**
     * check if a given document, identified by url hash as document id exists
     * @param id the url hash and document id
--- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java
@ -410,6 +410,14 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
        return s;
    }
    
+    @Override
+    public boolean exists(final String id) {
+        // solr0 is asked first; solr1 is only queried when solr0 is absent or does not report the id
+        final boolean inFirst = this.solr0 != null && this.solr0.exists(id);
+        if (inFirst) return true;
+        return this.solr1 != null && this.solr1.exists(id);
+    }
+
    @Override
    public LoadTimeURL getLoadTimeURL(String id) throws IOException {
        if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTimeURL(id);
--- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
@ -125,6 +125,13 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
     */
    public LoadTimeURL getLoadTimeURL(final String id) throws IOException;
    
+    /**
+     * Check whether a document exists; the document is identified by its url hash, which serves as the document id.
+     * @param id the url hash and document id
+     * @return whether the document exists
+     */
+    public boolean exists(final String id);
+
    /**
     * add a solr input document
     * @param solrdoc
--- a/source/net/yacy/cora/sorting/Array.java
+++ b/source/net/yacy/cora/sorting/Array.java
@ -26,8 +26,6 @@ package net.yacy.cora.sorting;

 import java.util.ArrayList;
 import java.util.Random;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.LinkedBlockingQueue;



@ -36,7 +34,6 @@ import java.util.concurrent.LinkedBlockingQueue;
 * @author admin
 *
 */
-@SuppressWarnings({ "rawtypes", "unchecked" })
 public class Array {

    public static <A> void sort(final Sortable<A> x) {
--- a/source/net/yacy/peers/Transmission.java
+++ b/source/net/yacy/peers/Transmission.java
@ -24,7 +24,6 @@

 package net.yacy.peers;

-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
@ -175,18 +174,12 @@ public class Transmission {
            i = c.entries();
            while (i.hasNext()) {
                final WordReference e = i.next();
-                try {
-                    if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
+                if (Transmission.this.segment.fulltext().exists(ASCII.String(e.urlhash()))) {
                    this.references.put(e.urlhash());
                } else {
                    notFoundx.add(e.urlhash());
                    this.badReferences.put(e.urlhash());
                }
-                } catch (IOException e1) {
-                    ConcurrentLog.logException(e1);
-                    notFoundx.add(e.urlhash());
-                    this.badReferences.put(e.urlhash());
-                }
            }
            // now delete all references that were not found
            for (final byte[] b : notFoundx) c.removeReference(b);
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -3702,14 +3702,7 @@ public final class Switchboard extends serverSwitch {
        // we must wait here until the url has actually disappeared
        int t = 100;
        while (t-- > 0) {
-            try {
-                long lt = this.index.getLoadTime(ASCII.String(urlhash));
-                if (lt < 0) break;
-            } catch (IOException e) {
-                // if this fails, the url may still exist
-                // we should abandon the whole process
-                return "exist-test failed: " + e.getMessage();
-            }
+            if (!this.index.exists(ASCII.String(urlhash))) break;
            try {Thread.sleep(100);} catch (final InterruptedException e) {}
            ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
            //if (t == 20) this.index.fulltext().commit(true);
@ -3830,16 +3823,10 @@ public final class Switchboard extends serverSwitch {
        for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
            final String urlName = e.getValue().toNormalform(true);
            if (doublecheck) {
-                try {
-                    if (this.index.getLoadTime(e.getKey()) >= 0) {
+                if (this.index.exists(e.getKey())) {
                    this.log.info("addToIndex: double " + urlName);
                    continue;
                }
-                } catch (IOException ee) {
-                    // double check fail may mean that the url exist
-                    this.log.info("addToIndex: doublecheck failed for " + urlName + ": " + ee.getMessage());
-                    continue;
-                }
            }
            final Request request = this.loader.request(e.getValue(), true, true);
            final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
@ -3914,11 +3901,7 @@ public final class Switchboard extends serverSwitch {
        Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
        for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
        for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
-            try {
-                if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
-            } catch (IOException ee) {
-                continue; // if the check fails, consider the url as double
-            }
+            if (this.index.exists(e.getKey())) continue; // double
            DigestURL url = e.getValue();
            final Request request = this.loader.request(url, true, true);
            final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@ -33,7 +33,6 @@ import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@ -45,6 +44,15 @@ import java.util.regex.Pattern;
 import java.util.zip.Deflater;
 import java.util.zip.GZIPOutputStream;

+import org.apache.lucene.util.Version;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrConfig;
+import org.apache.solr.schema.IndexSchema;
+
 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.encoding.ASCII;
@ -79,19 +87,10 @@ import net.yacy.search.schema.CollectionSchema;
 import net.yacy.search.schema.WebgraphConfiguration;
 import net.yacy.search.schema.WebgraphSchema;

-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.lucene.util.Version;
-import org.apache.solr.core.SolrConfig;
-import org.apache.solr.schema.IndexSchema;
-
 public final class Fulltext {

    private static final String SOLR_PATH = "solr_8_8_1"; // the number should be identical to the number in the property luceneMatchVersion in solrconfig.xml
-    private static final String SOLR_OLD_PATH[] = new String[]{"solr_36", "solr_40", "solr_44", "solr_45", "solr_46", "solr_47", "solr_4_9", "solr_4_10", "solr_5_2", "solr_5_5", "solr_6_6"};
+//    private static final String SOLR_OLD_PATH[] = new String[]{"solr_36", "solr_40", "solr_44", "solr_45", "solr_46", "solr_47", "solr_4_9", "solr_4_10", "solr_5_2", "solr_5_5", "solr_6_6"};

    // class objects
    private final File                    segmentPath;
@ -574,12 +573,21 @@ public final class Fulltext {
        return new DigestURL(md.url, ASCII.getBytes(urlHash));
    }
    
+    /**
+     * Check whether a document, identified by its url hash (which is the document id), exists.
+     * @param id the url hash and document id; a null id is reported as not existing, without querying
+     * @return true if the default connector reports the document as existing, false otherwise
+     */
+    public boolean exists(final String id) {
+        return id != null && this.getDefaultConnector().exists(id);
+    }
+
    /**
     * get the load time of a resource.
     * @param urlHash
     * @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
     */
-    public long getLoadTime(final String urlHash) throws IOException {
+    private long getLoadTime(final String urlHash) throws IOException {
        if (urlHash == null) return -1l;
        SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
        if (md == null) return -1l;
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@ -399,12 +399,12 @@ public class Segment {
    }
    
    /**
-     * get the load time of a resource.
-     * @param urlhash the resource hash
-     * @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
+     * Check whether a document exists; it is identified by its url hash, which serves as the document id.
+     * @param id the url hash and document id
+     * @return whether the document exists, as reported by the fulltext index
     */
-    public long getLoadTime(final String urlhash) throws IOException {
-        return this.fulltext.getLoadTime(urlhash);
+    public boolean exists(final String id) {
+        return this.fulltext.exists(id);
    }

    /**