Replace getLoadTime() with exists(), which uses a simpler query

Since Solr 8.8.1, getLoadTime() causes high CPU usage.
pull/405/head
sgaebel 4 years ago
parent 8e4d014c06
commit 26223dc25a

@ -136,8 +136,6 @@ public class IndexBrowser_p {
prop.put("files", 0);
prop.put("hostanalysis", 0);
String referer = header.get("Referer", "");
String path = post == null ? "" : post.get("path", "").trim();
sb.index.fulltext().commit(true);
if (post == null || env == null) {
@ -161,16 +159,11 @@ public class IndexBrowser_p {
String load = post.get("load", "");
boolean wait = false;
try {
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(ASCII.String(pathURI.hash()))) {
// in case that the url does not exist and loading is wanted turn this request into a loading request
load = path;
wait = true;
}
} catch (IOException e1) {
load = path;
wait = true;
}
if (load.length() > 0 && loadRight) {
// stack URL
DigestURL url;
@ -185,13 +178,8 @@ public class IndexBrowser_p {
sb.crawler.defaultProxyProfile.timezoneOffset()
));
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) waitloop: for (int i = 0; i < 30; i++) {
try {
if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
} catch (IOException e1) {
e1.printStackTrace();
break waitloop;
}
if (wait) for (int i = 0; i < 30; i++) {
if (sb.index.exists(ASCII.String(url.hash()))) break;
try {Thread.sleep(100);} catch (final InterruptedException e) {}
}
} catch (final MalformedURLException e) {

@ -304,20 +304,20 @@ public class IndexControlRWIs_p {
Reference iEntry;
while (urlIter.hasNext()) {
iEntry = urlIter.next();
long loadTime = segment.fulltext().getLoadTime(ASCII.String(iEntry.urlhash()));
if (loadTime < 0) {
boolean exists = segment.fulltext().exists(ASCII.String(iEntry.urlhash()));
if (exists) {
try {
unknownURLEntries.put(iEntry.urlhash());
knownURLs.put(iEntry.urlhash());
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);
}
urlIter.remove();
} else {
try {
knownURLs.put(iEntry.urlhash());
unknownURLEntries.put(iEntry.urlhash());
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);
}
urlIter.remove();
}
}

@ -27,7 +27,6 @@
// javac -classpath .:../classes transferRWI.java
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@ -234,12 +233,7 @@ public final class transferRWI {
}
for (String id: testids) {
try {
try {
if (sb.index.fulltext().getLoadTime(id) < 0) {
unknownURL.put(ASCII.getBytes(id));
}
} catch (IOException e) {
ConcurrentLog.logException(e);
if (!sb.index.fulltext().exists(id)) {
unknownURL.put(ASCII.getBytes(id));
}
} catch (final SpaceExceededException e) {

@ -146,14 +146,9 @@ public final class transferURL {
doublecheck = 0;
for (String id : lEm.keySet()) {
long lt = -1;
try {
lt = sb.index.getLoadTime(id);
} catch (IOException e1) {
lt = -1;
ConcurrentLog.logException(e1);
}
if (lt < 0) {
if (sb.index.exists(id)) {
doublecheck++;
} else {
lEntry = lEm.get(id);
// write entry to database
@ -166,8 +161,6 @@ public final class transferURL {
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
} else {
doublecheck++;
}
}

@ -465,8 +465,10 @@ public abstract class AbstractSolrConnector implements SolrConnector {
params.setFacet(false);
if (fields != null && fields.length > 0) params.setFields(fields);
params.setIncludeScore(false);
if (count > 0) {
params.setParam("defType", "edismax");
params.setParam(DisMaxParams.QF, CollectionSchema.text_t.getSolrFieldName() + "^1.0");
}
return params;
}
@ -498,6 +500,22 @@ public abstract class AbstractSolrConnector implements SolrConnector {
return md;
}
/**
 * Tests whether a document with the given id is present in the index.
 * The id is looked up with a raw term query on the id field; the query
 * carries the local parameter cache=false.
 *
 * @param id the url hash, which is also the document id
 * @return true if the count query reports at least one match; false if it
 *         reports none or if the count query fails with an IOException
 *         (the exception is logged, not propagated)
 */
@Override
public boolean exists(final String id) {
    final String idField = CollectionSchema.id.getSolrFieldName();
    final long hits;
    try {
        hits = getCountByQuery("{!cache=false raw f=" + idField + "}" + id);
    } catch (IOException ioe) {
        // a failed lookup is reported as "not existing"
        ConcurrentLog.logException(ioe);
        return false;
    }
    return hits > 0L;
}
/**
* get the number of results when this query is done.
* This should only be called if the actual result is never used, and only the count is interesting

@ -30,11 +30,6 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.solr.instance.SolrInstance;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.schema.CollectionSchema;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexableField;
@ -57,7 +52,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.SearchHandler;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.response.ResultContext;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
@ -71,6 +65,11 @@ import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.solr.instance.SolrInstance;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.schema.CollectionSchema;
public class EmbeddedSolrConnector extends SolrServerConnector implements SolrConnector {
public static final String SELECT = "/select";
@ -407,6 +406,22 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
return numFound;
}
/**
 * Tests whether a document with the given id is present in the index.
 * A DocListSearcher is opened for a raw term query on the id field
 * (offset 0, count 0, only the id field requested) and only its match
 * counter is evaluated.
 *
 * @param id the url hash, which is also the document id
 * @return true if the searcher reports at least one match; false if it
 *         reports none or if anything is thrown while searching
 *         (the throwable is logged, not propagated)
 */
@Override
public boolean exists(final String id) {
    final String idField = CollectionSchema.id.getSolrFieldName();
    final String rawIdQuery = "{!cache=false raw f=" + idField + "}" + id;
    try (DocListSearcher searcher = new DocListSearcher(rawIdQuery, null, 0, 0, idField)) {
        final long hits = searcher.response.matches();
        return hits > 0L;
    } catch (Throwable t) {
        // a failed lookup is reported as "not existing"
        ConcurrentLog.logException(t);
        return false;
    }
}
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id

@ -410,6 +410,14 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
return s;
}
/**
 * Tests whether a document with the given id is present in at least one
 * of the two mirrored connectors. solr0 is asked first; solr1 is only
 * asked when solr0 is absent or did not report the document.
 *
 * @param id the url hash, which is also the document id
 * @return true if any configured connector reports the document;
 *         false otherwise, including when neither connector is set
 */
@Override
public boolean exists(final String id) {
    return (this.solr0 != null && this.solr0.exists(id))
        || (this.solr1 != null && this.solr1.exists(id));
}
@Override
public LoadTimeURL getLoadTimeURL(String id) throws IOException {
if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTimeURL(id);

@ -125,6 +125,13 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
*/
public LoadTimeURL getLoadTimeURL(final String id) throws IOException;
/**
 * Check whether a document exists in the index.
 * The document is identified by its url hash, which is used as the document id.
 * No checked exception is declared, so implementations have to handle
 * lookup failures themselves.
 * @param id the url hash and document id
 * @return true if the document exists, false otherwise
 */
public boolean exists(final String id);
/**
* add a solr input document
* @param solrdoc

@ -26,8 +26,6 @@ package net.yacy.cora.sorting;
import java.util.ArrayList;
import java.util.Random;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
@ -36,7 +34,6 @@ import java.util.concurrent.LinkedBlockingQueue;
* @author admin
*
*/
@SuppressWarnings({ "rawtypes", "unchecked" })
public class Array {
public static <A> void sort(final Sortable<A> x) {

@ -24,7 +24,6 @@
package net.yacy.peers;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@ -175,18 +174,12 @@ public class Transmission {
i = c.entries();
while (i.hasNext()) {
final WordReference e = i.next();
try {
if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
if (Transmission.this.segment.fulltext().exists(ASCII.String(e.urlhash()))) {
this.references.put(e.urlhash());
} else {
notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash());
}
} catch (IOException e1) {
ConcurrentLog.logException(e1);
notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash());
}
}
// now delete all references that were not found
for (final byte[] b : notFoundx) c.removeReference(b);

@ -3702,14 +3702,7 @@ public final class Switchboard extends serverSwitch {
// we must wait here until the url has actually disappeared
int t = 100;
while (t-- > 0) {
try {
long lt = this.index.getLoadTime(ASCII.String(urlhash));
if (lt < 0) break;
} catch (IOException e) {
// if this fails, the url may still exist
// we should abandon the whole process
return "exist-test failed: " + e.getMessage();
}
if (!this.index.exists(ASCII.String(urlhash))) break;
try {Thread.sleep(100);} catch (final InterruptedException e) {}
ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
//if (t == 20) this.index.fulltext().commit(true);
@ -3830,16 +3823,10 @@ public final class Switchboard extends serverSwitch {
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
final String urlName = e.getValue().toNormalform(true);
if (doublecheck) {
try {
if (this.index.getLoadTime(e.getKey()) >= 0) {
if (this.index.exists(e.getKey())) {
this.log.info("addToIndex: double " + urlName);
continue;
}
} catch (IOException ee) {
// double check fail may mean that the url exist
this.log.info("addToIndex: doublecheck failed for " + urlName + ": " + ee.getMessage());
continue;
}
}
final Request request = this.loader.request(e.getValue(), true, true);
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
@ -3914,11 +3901,7 @@ public final class Switchboard extends serverSwitch {
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
try {
if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
} catch (IOException ee) {
continue; // if the check fails, consider the url as double
}
if (this.index.exists(e.getKey())) continue; // double
DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));

@ -33,7 +33,6 @@ import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@ -45,6 +44,15 @@ import java.util.regex.Pattern;
import java.util.zip.Deflater;
import java.util.zip.GZIPOutputStream;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.schema.IndexSchema;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.encoding.ASCII;
@ -79,19 +87,10 @@ import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.schema.WebgraphSchema;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.lucene.util.Version;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.schema.IndexSchema;
public final class Fulltext {
private static final String SOLR_PATH = "solr_8_8_1"; // the number should be identical to the number in the property luceneMatchVersion in solrconfig.xml
private static final String SOLR_OLD_PATH[] = new String[]{"solr_36", "solr_40", "solr_44", "solr_45", "solr_46", "solr_47", "solr_4_9", "solr_4_10", "solr_5_2", "solr_5_5", "solr_6_6"};
// private static final String SOLR_OLD_PATH[] = new String[]{"solr_36", "solr_40", "solr_44", "solr_45", "solr_46", "solr_47", "solr_4_9", "solr_4_10", "solr_5_2", "solr_5_5", "solr_6_6"};
// class objects
private final File segmentPath;
@ -574,12 +573,21 @@ public final class Fulltext {
return new DigestURL(md.url, ASCII.getBytes(urlHash));
}
/**
 * Check whether a document exists in the default connector.
 * The document is identified by its url hash, which is used as the document id.
 *
 * @param id the url hash and document id; may be null
 * @return true if the default connector reports the document;
 *         false if it does not, or if id is null
 */
public boolean exists(final String id) {
    // getLoadTime(), which this method replaces at the call sites, treats a
    // null hash as "document does not exist"; keep that contract here and
    // avoid sending a query for the literal text "null" to the connector
    if (id == null) return false;
    return this.getDefaultConnector().exists(id);
}
/**
* get the load time of a resource.
* @param urlHash
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
*/
public long getLoadTime(final String urlHash) throws IOException {
private long getLoadTime(final String urlHash) throws IOException {
if (urlHash == null) return -1l;
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
if (md == null) return -1l;

@ -399,12 +399,12 @@ public class Segment {
}
/**
* get the load time of a resource.
* @param urlhash the resource hash
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return whether the documents exists
*/
public long getLoadTime(final String urlhash) throws IOException {
return this.fulltext.getLoadTime(urlhash);
public boolean exists(final String id) {
return this.fulltext.exists(id);
}
/**

Loading…
Cancel
Save