replaced check for load time method

Instead of loading the full Solr document, a dedicated index holding only
the last load time is created and queried. This prevents Solr from having
to fetch from its own index while that index is being built. Excessive
re-loading of documents during indexing has been shown to produce
deadlocks, so this should now be prevented.
pull/436/head
Michael Peter Christen 3 years ago
parent 1ead7b85b5
commit 163ba26d90
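
In essence, the commit swaps a Solr lookup for a read from a small dedicated table mapping url hash to last load time. The sketch below illustrates that pattern only and is not the committed code: YaCy's real table is an on-disk kelondro IndexTable with 12-byte url-hash keys and 8-byte timestamp values (see the Segment hunks below), whereas this self-contained example uses a plain ConcurrentHashMap as a hypothetical stand-in.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class LoadTimeIndexSketch {

    // Hypothetical stand-in for the on-disk IndexTable (url hash -> load time).
    private final Map<String, Long> loadTimeIndex = new ConcurrentHashMap<>();

    // Mirrors Segment.setLoadTime: always overwrite the previous entry.
    public void setLoadTime(final String urlhash, final long time) {
        if (urlhash == null || time <= 0) return;
        this.loadTimeIndex.put(urlhash, time);
    }

    // Mirrors Segment.getLoadTime: -1 means the document was never loaded.
    public long getLoadTime(final String urlhash) {
        if (urlhash == null) return -1L;
        final Long t = this.loadTimeIndex.get(urlhash);
        return t == null ? -1L : t;
    }

    public static void main(final String[] args) {
        final LoadTimeIndexSketch idx = new LoadTimeIndexSketch();
        idx.setLoadTime("abcdefghijk1", System.currentTimeMillis()); // 12-char hash, as in YaCy
        // The crawler's double-check now reads this table instead of querying Solr:
        System.out.println(idx.getLoadTime("abcdefghijk1") >= 0 ? "already loaded" : "not loaded yet");
    }
}

The design point: during heavy indexing, every stacked URL previously triggered a Solr fetch (getLoadTimeURL) against the very index being written, which the commit message identifies as a deadlock source; the dedicated table answers the same question without touching Solr.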

@@ -82,11 +82,11 @@ public class IndexControlURLs_p {
List<File> dumpFiles = segment.fulltext().dumpFiles();
prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath());
prop.put("dumprestore_optimizemax", 10);
prop.put("dumprestore_rebootSolrEnabled",
sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT,
SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)
&& !sb.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED,
SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT));
prop.put("dumprestore_rebootSolrEnabled",
sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT,
SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)
&& !sb.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED,
SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT));
prop.put("cleanup", ucount == 0 ? 0 : 1);
prop.put("cleanupsolr", segment.fulltext().connectedRemoteSolr() ? 1 : 0);
prop.put("cleanuprwi", segment.termIndex() != null && !segment.termIndex().isEmpty() ? 1 : 0);
@@ -119,14 +119,20 @@ public class IndexControlURLs_p {
// delete everything
if ( post.containsKey("deletecomplete") ) {
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
if ( post.get("deleteIndex", "").equals("on") ) {
try {segment.fulltext().clearLocalSolr();} catch (final IOException e) {}
try {
segment.fulltext().clearLocalSolr();
segment.loadTimeIndex().clear();
} catch (final IOException e) {}
}
if ( post.get("deleteRemoteSolr", "").equals("on")) {
try {segment.fulltext().clearRemoteSolr();} catch (final IOException e) {}
try {
segment.fulltext().clearRemoteSolr();
segment.loadTimeIndex().clear();
} catch (final IOException e) {}
}
if ( post.get("deleteRWI", "").equals("on")) {
if (segment.termIndex() != null) try {segment.termIndex().clear();} catch (final IOException e) {}
@@ -135,7 +141,10 @@ public class IndexControlURLs_p {
if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {}
}
if ( post.get("deleteFirstSeen", "").equals("on")) {
try {segment.firstSeen().clear();} catch (final IOException e) {}
try {
segment.firstSeenIndex().clear();
segment.loadTimeIndex().clear();
} catch (final IOException e) {}
}
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
sb.crawlQueues.clear();
@@ -152,8 +161,8 @@
}
if (post.containsKey("urlhashdeleteall")) {
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST);
@@ -161,8 +170,8 @@
}
if (post.containsKey("urlhashdelete")) {
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
DigestURL url;
try {
@@ -181,8 +190,8 @@
}
if (post.containsKey("urldelete")) {
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
try {
urlhash = ASCII.String((new DigestURL(urlstring)).hash());
@@ -229,31 +238,31 @@ public class IndexControlURLs_p {
}
if (post.containsKey("optimizesolr")) {
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
final int size = post.getInt("optimizemax", 10);
segment.fulltext().optimize(size);
final int size = post.getInt("optimizemax", 10);
segment.fulltext().optimize(size);
sb.tables.recordAPICall(post, "IndexControlURLs_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr optimize " + size);
}
if (post.containsKey("rebootsolr")) {
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
if (sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT,
SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)
&& !sb.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED,
SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT)) {
/* This operation is designed only for an embedded local Solr with no mirroring to an external remote Solr server */
segment.fulltext().rebootEmbeddedLocalSolr();
sb.tables.recordAPICall(post, "IndexControlURLs_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr reboot");
}
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
if (sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT,
SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)
&& !sb.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED,
SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT)) {
/* This operation is designed only for an embedded local Solr with no mirroring to an external remote Solr server */
segment.fulltext().rebootEmbeddedLocalSolr();
sb.tables.recordAPICall(post, "IndexControlURLs_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr reboot");
}
}
if (post.containsKey("deletedomain")) {
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
final String domain = post.get("domain");
Set<String> hostnames = new HashSet<String>();

@@ -96,6 +96,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
protected final static int pagesize_docs = 100;
protected final static int pagesize_ids = 1000;
@Deprecated
protected static LoadTimeURL getLoadTimeURL(final Object doc) {
if (doc == null) return null;
Object d = null;
@@ -478,6 +479,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
* @return metadata if any entry in solr exists, null otherwise
* @throws IOException
*/
@Deprecated
@Override
public LoadTimeURL getLoadTimeURL(String id) throws IOException {
// construct raw query

@@ -425,10 +425,12 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
/**
* check if a given document, identified by url hash as document id exists
* @Deprecated use Segment.getLastSeenTime instead
* @param id the url hash and document id
* @return the load date if any entry in solr exists, null otherwise
* @throws IOException
*/
@Deprecated
@Override
public LoadTimeURL getLoadTimeURL(String id) throws IOException {
int responseCount = 0;

@@ -27,9 +27,6 @@ import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.kelondro.data.word.Word;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
@@ -38,6 +35,9 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.kelondro.data.word.Word;
public class MirrorSolrConnector extends AbstractSolrConnector implements SolrConnector {
// the twin solrs
@@ -188,7 +188,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
public SolrDocument getDocumentById(final String key, final String ... fields) throws IOException {
assert key.length() == Word.commonHashLength : "wrong id: " + key;
SolrDocument doc;
if ((solr0 != null && ((doc = solr0.getDocumentById(key, fields)) != null)) || (solr1 != null && ((doc = solr1.getDocumentById(key, fields)) != null))) {
if ((this.solr0 != null && ((doc = this.solr0.getDocumentById(key, fields)) != null)) || (this.solr1 != null && ((doc = this.solr1.getDocumentById(key, fields)) != null))) {
return doc;
}
return null;
@@ -418,6 +418,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
return result;
}
@Deprecated
@Override
public LoadTimeURL getLoadTimeURL(String id) throws IOException {
if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTimeURL(id);

@@ -26,8 +26,6 @@ import java.util.LinkedHashMap;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.sorting.ReversibleScoreMap;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
@@ -35,6 +33,8 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import net.yacy.cora.sorting.ReversibleScoreMap;
public interface SolrConnector extends Iterable<String> /* Iterable of document IDs */ {
public static class LoadTimeURL {
@@ -123,6 +123,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @return the load time metadata (url and load data) if any entry in solr exists, null otherwise
* @throws IOException
*/
@Deprecated
public LoadTimeURL getLoadTimeURL(final String id) throws IOException;
/**

@@ -425,23 +425,15 @@ public final class CrawlStacker implements WorkflowTask<Request>{
if (dbocc != null) {
return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": " + dbocc.name();
}
final String urlhash = ASCII.String(url.hash());
LoadTimeURL oldEntry = null;
try {
oldEntry = this.indexSegment.fulltext().getDefaultConnector().getLoadTimeURL(urlhash);
} catch (final IOException e) {
// if an exception here occurs then there is the danger that urls which had been in the crawler are overwritten a second time
// to prevent that, we reject urls in these events
ConcurrentLog.logException(e);
return "exception during double-test: " + e.getMessage();
}
String urls = url.toNormalform(false);
LoadTimeURL oldEntry = this.indexSegment.getLoadTimeURL(urls, url.hash());
// deny urls that exceed allowed number of occurrences
final int maxAllowedPagesPerDomain = profile.domMaxPages();
if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) {
final AtomicInteger dp = profile.getCount(url.getHost());
if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toNormalform(false) + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urls + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
return "crawl stack domain counter exceeded (test by profile)";
}

File diff suppressed because it is too large

@@ -319,8 +319,8 @@ public final class Fulltext {
private long lastCommit = 0;
public void commit(boolean softCommit) {
long t = System.currentTimeMillis();
if (lastCommit + 10000 > t) return;
lastCommit = t;
if (this.lastCommit + 10000 > t) return;
this.lastCommit = t;
getDefaultConnector().commit(softCommit);
if (this.writeWebgraph) getWebgraphConnector().commit(softCommit);
}
@@ -587,6 +587,7 @@ public final class Fulltext {
* @param urlHash
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
*/
@Deprecated
private long getLoadTime(final String urlHash) throws IOException {
if (urlHash == null) return -1l;
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
@@ -854,7 +855,7 @@ public final class Fulltext {
try (/* Resources automatically closed by this try-with-resources statement */
final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f);
final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}} : os;
final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os;
final PrintWriter pw = new PrintWriter(new BufferedOutputStream(wrappedStream));
) {
if (this.format == ExportFormat.html) {

@@ -51,6 +51,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.ByteOrder;
@@ -110,8 +111,9 @@ public class Segment {
public static final long targetFileSize = 64 * 1024 * 1024; // 64 MB
public static final int writeBufferSize = 4 * 1024 * 1024;
public static final String termIndexName = "text.index";
public static final String citationIndexName = "citation.index";
public static final String citationIndexName = "citation.index";
public static final String firstseenIndexName = "firstseen.index";
public static final String loadtimeIndexName = "loadtime.index";
// the reference factory
public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
@@ -122,9 +124,10 @@ public class Segment {
private final File segmentPath;
protected final Fulltext fulltext;
protected IndexCell<WordReference> termIndex;
protected IndexCell<CitationReference> urlCitationIndex;
protected IndexTable firstSeenIndex;
protected IODispatcher merger = null; // shared iodispatcher for kelondro indexes
private IndexCell<CitationReference> urlCitationIndex;
private IndexTable firstSeenIndex;
private IndexTable loadTimeIndex;
private IODispatcher merger = null; // shared iodispatcher for kelondro indexes
/**
* create a new Segment
@@ -143,6 +146,7 @@ public class Segment {
this.termIndex = null;
this.urlCitationIndex = null;
this.firstSeenIndex = new IndexTable(new File(segmentPath, firstseenIndexName), 12, 8, false, false);
this.loadTimeIndex = new IndexTable(new File(segmentPath, loadtimeIndexName), 12, 8, false, false);
}
public boolean connectedRWI() {
@@ -166,7 +170,7 @@ public class Segment {
targetFileSize,
maxFileSize,
writeBufferSize,
merger);
this.merger);
}
public void disconnectRWI() {
@@ -196,7 +200,7 @@ public class Segment {
targetFileSize,
maxFileSize,
writeBufferSize,
merger);
this.merger);
}
public void disconnectCitation() {
@@ -225,10 +229,14 @@ public class Segment {
return this.urlCitationIndex;
}
public IndexTable firstSeen() {
public IndexTable firstSeenIndex() {
return this.firstSeenIndex;
}
public IndexTable loadTimeIndex() {
return this.loadTimeIndex;
}
public ReferenceReportCache getReferenceReportCache() {
return new ReferenceReportCache();
}
@@ -239,12 +247,12 @@ public class Segment {
this.cache = new ConcurrentHashMap<String, ReferenceReport>();
}
public ReferenceReport getReferenceReport(final String id, final boolean acceptSelfReference) throws IOException {
ReferenceReport rr = cache.get(id);
if (MemoryControl.shortStatus()) cache.clear();
ReferenceReport rr = this.cache.get(id);
if (MemoryControl.shortStatus()) this.cache.clear();
if (rr != null) return rr;
try {
rr = new ReferenceReport(ASCII.getBytes(id), acceptSelfReference);
cache.put(id, rr);
this.cache.put(id, rr);
return rr;
} catch (final SpaceExceededException e) {
ConcurrentLog.logException(e);
@@ -278,19 +286,19 @@ public class Segment {
CitationReference ref = ri.next();
byte[] hh = ref.hosthash(); // host hash
if (ByteBuffer.equals(hh, 0, id, 6, 6)) {
internalIDs.put(ref.urlhash());
internal++;
this.internalIDs.put(ref.urlhash());
this.internal++;
} else {
externalHosts.put(hh);
externalIDs.put(ref.urlhash());
external++;
this.externalHosts.put(hh);
this.externalIDs.put(ref.urlhash());
this.external++;
}
}
} catch (SpaceExceededException e) {
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
if (Segment.this.fulltext.useWebgraph()) internalIDs.clear();
if (Segment.this.fulltext.useWebgraph()) this.internalIDs.clear();
}
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
if ((this.internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
// read the references from the webgraph
SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!cache=false raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, Long.MAX_VALUE, 100, 1, false, WebgraphSchema.source_id_s.getSolrFieldName());
@@ -305,13 +313,13 @@ public class Segment {
System.arraycopy(refidh, 6, hh, 0, 6);
if (ByteBuffer.equals(hh, 0, id, 6, 6)) {
if (acceptSelfReference || !Arrays.equals(refidh, id)) {
internalIDs.put(refidh);
internal++;
this.internalIDs.put(refidh);
this.internal++;
}
} else {
externalHosts.put(hh);
externalIDs.put(refidh);
external++;
this.externalHosts.put(hh);
this.externalIDs.put(refidh);
this.external++;
}
}
} catch (final InterruptedException e) {
@@ -398,6 +406,35 @@ public class Segment {
}
}
public void setLoadTime(final byte[] urlhash, long time) {
if (urlhash == null || time <= 0) return;
try {
this.loadTimeIndex.put(urlhash, time); // ALWAYS overwrite!
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}
public long getLoadTime(final byte[] urlhash) {
if (urlhash == null) return -1;
try {
return this.loadTimeIndex.get(urlhash);
} catch (IOException e) {
ConcurrentLog.logException(e);
return -1;
}
}
public LoadTimeURL getLoadTimeURL(String url, byte[] urlhash) {
long t = getLoadTime(urlhash);
if (t < 0) return null;
return new LoadTimeURL(url, t);
}
public LoadTimeURL getLoadTimeURL(String url, String id) {
return getLoadTimeURL(url, id.getBytes());
}
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
@@ -483,6 +520,7 @@ public class Segment {
if (this.fulltext != null) this.fulltext.close();
if (this.urlCitationIndex != null) this.urlCitationIndex.close();
if (this.firstSeenIndex != null) this.firstSeenIndex.close();
if (this.loadTimeIndex != null) this.loadTimeIndex.close();
if (this.merger != null) {
this.merger.terminate();
this.merger = null;
@@ -661,7 +699,9 @@ public class Segment {
}
// REMEMBER FIRST SEEN
setFirstSeenTime(url.hash(), Math.min(document.getLastModified().getTime(), System.currentTimeMillis())); // should exist already in the index at this time, but just to make sure
long now = System.currentTimeMillis();
setFirstSeenTime(url.hash(), Math.min(document.getLastModified().getTime(), now)); // should exist already in the index at this time, but just to make sure
setLoadTime(url.hash(), now); // always overwrites index entry
// write the edges to the citation reference index
if (this.connectedCitation()) try {
@@ -676,7 +716,7 @@ public class Segment {
String referrerhash = id;
String anchorhash = ASCII.String(new DigestURL(targetURL).hash());
if (referrerhash != null && anchorhash != null) {
urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
this.urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
}
} catch (Throwable e) {
ConcurrentLog.logException(e);
@@ -692,7 +732,7 @@ public class Segment {
String referrerhash = id;
String anchorhash = ASCII.String(new DigestURL(targetURL).hash());
if (referrerhash != null && anchorhash != null) {
urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
this.urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
}
} catch (Throwable e) {
ConcurrentLog.logException(e);
