/**
* RecrawlBusyThread.java
* Copyright 2015 by Burkhard Buelte
* First released 15.05.2015 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt. If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;

import java.io.IOException;
import java.net.MalformedURLException;
import java.time.LocalDateTime;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;

/**
* Selects documents from the local index by a query
* and feeds the found URLs to the crawler to recrawl the documents.
* This is intended to keep the index up-to-date.
* Currently the documents are selected by an expired fresh_date_dt field
* and are added to the crawler in smaller chunks (see chunksize), as long as no other crawl is running.
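*
* A minimal usage sketch (values are illustrative, assuming a running Switchboard instance {@code sb}):
* <pre>{@code
* RecrawlBusyThread recrawlJob = new RecrawlBusyThread(sb,
*         RecrawlBusyThread.DEFAULT_QUERY,
*         RecrawlBusyThread.DEFAULT_INCLUDE_FAILED,
*         RecrawlBusyThread.DEFAULT_DELETE_ON_RECRAWL);
* recrawlJob.start(); // runs job() periodically, honoring the configured busy/idle sleep times
* }</pre>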
*/
public class RecrawlBusyThread extends AbstractBusyThread {
/** The thread name */
public static final String THREAD_NAME = "recrawlindex";
/** The default selection query: selects documents whose fresh_date_dt is older than
* the start of the previous day (Solr date math {@code NOW/DAY-1DAY}) */
public static final String DEFAULT_QUERY = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]";
/** Default value for inclusion or not of documents with an HTTP status different from 200 (success) */
public static final boolean DEFAULT_INCLUDE_FAILED = false;
/** The default value whether to delete on Recrawl */
public static final boolean DEFAULT_DELETE_ON_RECRAWL = false;
/** The current query selecting documents to recrawl */
private String currentQuery;
/** flag if docs with httpstatus_i <> 200 shall be recrawled */
private boolean includefailed;
/** flag whether to delete on Recrawl */
private boolean deleteOnRecrawl;
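/** Offset of the next chunk to fetch from the Solr query results */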
private int chunkstart = 0;
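/** Maximum number of URLs fetched from Solr and fed to the crawler per cycle */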
private final int chunksize = 100;
private final Switchboard sb;
/** buffer of urls to recrawl */
private final Set<DigestURL> urlstack;
/** The total number of candidate URLs found for recrawl */
private long urlsToRecrawl = 0;
/** Total number of URLs added to the crawler queue for recrawl */
private long recrawledUrlsCount = 0;
/** Total number of URLs rejected for some reason by the crawl stacker or the crawler queue */
private long rejectedUrlsCount = 0;
/** Total number of malformed URLs found */
private long malformedUrlsCount = 0;
/** Total number of malformed URLs deleted from index */
private long malformedUrlsDeletedCount = 0;
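/** Sort criteria applied to the Solr selection query (see the docvalues workaround note in the constructor) */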
private String solrSortBy;
/** Set to true when more URLs are still to be processed */
private boolean moreToRecrawl = true;
/** True when the job terminated early because an error occurred when requesting the Solr index, or the Solr index was closed */
private boolean terminatedBySolrFailure = false;
/** The recrawl job start time */
private LocalDateTime startTime;
/** The recrawl job end time */
private LocalDateTime endTime;
/**
* @param xsb
*            the Switchboard instance holding the server environment
* @param query
*            the Solr selection query
* @param includeFailed
*            set to true when documents with an HTTP status different from 200
*            (success) must be included
* @param deleteOnRecrawl
*            set to true to delete each selected document from the index once it
*            has been added to the crawler queue
*/
public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed, final boolean deleteOnRecrawl) {
super(3000, 1000); // set lower limits of cycle delay
setName(THREAD_NAME);
this.setIdleSleep(10*60000); // set actual cycle delays
this.setBusySleep(2*60000);
this.setPriority(Thread.MIN_PRIORITY);
this.setLoadPreReqisite(1);
this.sb = xsb;
this.currentQuery = query;
this.includefailed = includeFailed;
this.deleteOnRecrawl = deleteOnRecrawl;
this.urlstack = new HashSet<DigestURL>();
// workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
// org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
solrSortBy = CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
final SolrConnector solrConnector = this.sb.index.fulltext().getDefaultConnector();
if (solrConnector != null && !solrConnector.isClosed()) {
/* Ensure indexed data is up-to-date before running the main job */
solrConnector.commit(true);
}
}
/**
* Sets the query used to select documents to recrawl
* and resets the chunk offset to start a fresh query loop
* @param q select query
* @param includefailedurls true = docs with any HTTP status are recrawled, false = only docs with httpstatus=200 are recrawled
* @param deleteOnRecrawl true to delete each selected document from the index once it has been added to the crawler queue
*/
public void setQuery(String q, boolean includefailedurls, final boolean deleteOnRecrawl) {
this.currentQuery = q;
this.includefailed = includefailedurls;
this.deleteOnRecrawl = deleteOnRecrawl;
this.chunkstart = 0;
}
public String getQuery() {
return this.currentQuery;
}
/**
*
* @param queryBase
*            the base query
* @param includeFailed
*            set to true when documents with an HTTP status different from 200
*            (success) must be included
* @return the Solr selection query for candidate URLs to recrawl
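*
* <p>For example (illustrative values), {@code buildSelectionQuery("fresh_date_dt:[* TO NOW/DAY-1DAY]", false)}
* returns {@code "fresh_date_dt:[* TO NOW/DAY-1DAY] AND (httpstatus_i:200)"}.</p>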
*/
public static final String buildSelectionQuery(final String queryBase, final boolean includeFailed) {
return includeFailed ? queryBase : queryBase + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)";
}
/**
* Sets the flag to include failed URLs (httpstatus_i <> 200):
* if true, currentQuery is used as is;
* if false, the term " AND (httpstatus_i:200)" is appended to currentQuery
* @param includefailedurls
*/
public void setIncludeFailed(boolean includefailedurls) {
this.includefailed = includefailedurls;
}
public boolean getIncludeFailed() {
return this.includefailed;
}
public void setDeleteOnRecrawl(final boolean deleteOnRecrawl) {
this.deleteOnRecrawl = deleteOnRecrawl;
}
public boolean getDeleteOnRecrawl() {
return this.deleteOnRecrawl;
}
/**
* Feeds the buffered URLs to the local crawler.
* (Switchboard.addToCrawler() is not used here, as it always skips already existing URLs)
*
* @return true if at least one URL was added to and accepted by the crawler
*/
private boolean feedToCrawler() {
int added = 0;
if (!this.urlstack.isEmpty()) {
final CrawlProfile profile = sb.crawler.defaultRecrawlJobProfile;
for (final DigestURL url : this.urlstack) {
final Request request = new Request(ASCII.getBytes(this.sb.peers.mySeed().hash), url, null, "",
new Date(), profile.handle(), 0, profile.timezoneOffset());
String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (!includefailed && acceptedError == null) { // skip check if failed docs to be included
acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
}
if (acceptedError != null) {
this.rejectedUrlsCount++;
ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
continue;
}
final String s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots);
if (s != null) {
this.rejectedUrlsCount++;
ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
} else {
added++;
this.recrawledUrlsCount++;
}
}
this.urlstack.clear();
}
return (added > 0);
}
/**
* Process query and hand over urls to the crawler
*
* @return true if something processed
*/
@Override
public boolean job() {
// if more than chunksize entries are already waiting in the local crawl queue, do nothing
if (sb.crawlQueues.coreCrawlJobSize() > this.chunksize) {
return false;
}
boolean didSomething = false;
if (this.urlstack.isEmpty()) {
if (!this.moreToRecrawl) {
/* We do not remove the thread from the Switchboard worker threads using serverSwitch.terminateThread(String,boolean),
* because we want to be able to provide a report after its termination */
terminate(false);
} else {
this.moreToRecrawl = processSingleQuery();
/* Even if no more URLs are left to recrawl, the job has done something by querying the Solr index */
didSomething = true;
}
} else {
didSomething = feedToCrawler();
}
return didSomething;
}
@Override
public synchronized void start() {
this.startTime = LocalDateTime.now();
super.start();
}
@Override
public void terminate(boolean waitFor) {
super.terminate(waitFor);
this.endTime = LocalDateTime.now();
}
/**
* Selects the next chunk of documents from the index and buffers their URLs for recrawling
* @return true if the query has more results
*/
private boolean processSingleQuery() {
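// do not query a new chunk while URLs from the previous one are still buffered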
if (!this.urlstack.isEmpty()) {
return true;
}
SolrDocumentList docList = null;
final SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (solrConnector == null || solrConnector.isClosed()) {
this.urlsToRecrawl = 0;
this.terminatedBySolrFailure = true;
return false;
}
try {
// query all or only httpstatus=200 depending on includefailed flag
docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
this.urlsToRecrawl = docList.getNumFound();
} catch (final Throwable e) {
this.urlsToRecrawl = 0;
this.terminatedBySolrFailure = true;
ConcurrentLog.severe(THREAD_NAME, "error selecting documents to recrawl", e);
}
if (docList != null) {
Set<String> tobedeletedIDs = new HashSet<>();
for (final SolrDocument doc : docList) {
try {
this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
if (deleteOnRecrawl) tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
} catch (final MalformedURLException ex) {
this.malformedUrlsCount++;
// if the index entry has no valid url it is useless: delete it
tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
this.malformedUrlsDeletedCount++;
ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
}
}
if (!tobedeletedIDs.isEmpty()) {
try {
solrConnector.deleteByIds(tobedeletedIDs);
solrConnector.commit(false);
} catch (final IOException e) {
ConcurrentLog.severe(THREAD_NAME, "error deleting IDs ", e);
}
}
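// when deleting on recrawl, the remaining results shift down by one chunk each cycle,
// so reading continues at offset 0; otherwise the window advances by chunksize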
this.chunkstart = deleteOnRecrawl ? 0 : this.chunkstart + this.chunksize;
}
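// fewer results than the chunk size means the selection query is exhausted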
if (docList == null || docList.size() < this.chunksize) {
return false;
}
return true;
}
/**
* @return a new default CrawlProfile instance to be used for recrawl jobs.
*/
public static CrawlProfile buildDefaultCrawlProfile() {
CrawlProfile profile = new CrawlProfile(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB, CrawlProfile.MATCH_ALL_STRING, // crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, // crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, // crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, // crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, // crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, // indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0);
return profile;
}
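/**
* @return the number of URLs currently buffered and waiting to be fed to the crawler
*/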
@Override
public int getJobCount() {
return this.urlstack.size();
}
/**
* @return The total number of candidate URLs found for recrawl
*/
public long getUrlsToRecrawl() {
return this.urlsToRecrawl;
}
/**
* @return The total number of URLs added to the crawler queue for recrawl
*/
public long getRecrawledUrlsCount() {
return this.recrawledUrlsCount;
}
/**
* @return The total number of URLs rejected for some reason by the crawl
* stacker or the crawler queue
*/
public long getRejectedUrlsCount() {
return this.rejectedUrlsCount;
}
/**
* @return The total number of malformed URLs found
*/
public long getMalformedUrlsCount() {
return this.malformedUrlsCount;
}
/**
* @return The total number of malformed URLs deleted from index
*/
public long getMalformedUrlsDeletedCount() {
return this.malformedUrlsDeletedCount;
}
/**
* @return true when the job terminated early because an error occurred when
* requesting the Solr index, or the Solr index was closed
*/
public boolean isTerminatedBySolrFailure() {
return this.terminatedBySolrFailure;
}
/** @return The recrawl job start time */
public LocalDateTime getStartTime() {
return this.startTime;
}
/** @return The recrawl job end time */
public LocalDateTime getEndTime() {
return this.endTime;
}
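/**
* Releases memory by clearing the buffered URL stack
*/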
@Override
public void freemem() {
this.urlstack.clear();
}
}