/**
 * RecrawlBusyThread.java
 * SPDX-FileCopyrightText: 2015 by Burkhard Buelte
 * SPDX-License-Identifier: GPL-2.0-or-later
 * First released 15.05.2015 at http://yacy.net
 *
 * This is a part of YaCy, a peer-to-peer based web search engine
 *
 * LICENSE
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt If not, see
 * <http://www.gnu.org/licenses/>.
 */
package net.yacy.crawler;

import java.io.IOException;
import java.net.MalformedURLException;
import java.time.LocalDateTime;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;

/**
 * Selects documents by a query from the local index
 * and feeds the found urls to the crawler to recrawl the documents.
 * This is intended to keep the index up-to-date.
 * Currently the documents are selected by an expired fresh_date_dt field
 * and added to the crawler in smaller chunks (see chunksize), as long as no other crawl is running.
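 *
 * A minimal usage sketch (simplified: in YaCy the thread is normally registered
 * with the Switchboard's worker thread management rather than started directly):
 * <pre>{@code
 * RecrawlBusyThread recrawl = new RecrawlBusyThread(
 *         Switchboard.getSwitchboard(),
 *         RecrawlBusyThread.DEFAULT_QUERY,
 *         RecrawlBusyThread.DEFAULT_INCLUDE_FAILED,
 *         RecrawlBusyThread.DEFAULT_DELETE_ON_RECRAWL);
 * recrawl.start();
 * }</pre>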
 */
public class RecrawlBusyThread extends AbstractBusyThread {

    /** The thread name */
    public final static String THREAD_NAME = "recrawlindex";

    /** The default selection query */
    public static final String DEFAULT_QUERY = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]";
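    // Solr date math note: NOW/DAY rounds down to the start of the current day,
    // so the range [* TO NOW/DAY-1DAY] matches documents whose fresh_date_dt is
    // no later than the start of yesterday, i.e. whose freshness expired at
    // least one full day ago.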

    /** Default value for inclusion or not of documents with an HTTP status different from 200 (success) */
    public static final boolean DEFAULT_INCLUDE_FAILED = false;

    /** The default value whether to delete on Recrawl */
    public static final boolean DEFAULT_DELETE_ON_RECRAWL = false;

    /** The current query selecting documents to recrawl */
    private String currentQuery;

    /** flag if docs with httpstatus_i <> 200 shall be recrawled */
    private boolean includefailed;

    /** flag whether to delete on Recrawl */
    private boolean deleteOnRecrawl;
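
    // Paging state for the Solr selection: chunkstart is the offset of the next
    // result window, and chunksize bounds both that window and the number of
    // URLs handed to the crawler per busy cycle.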
    private int chunkstart = 0;
    private final int chunksize = 100;
    private final Switchboard sb;

    /** buffer of urls to recrawl */
    private final Set<DigestURL> urlstack;

    /** The total number of candidate URLs found for recrawl */
    private long urlsToRecrawl = 0;

    /** Total number of URLs added to the crawler queue for recrawl */
    private long recrawledUrlsCount = 0;

    /** Total number of URLs rejected for some reason by the crawl stacker or the crawler queue */
    private long rejectedUrlsCount = 0;

    /** Total number of malformed URLs found */
    private long malformedUrlsCount = 0;

    /** Total number of malformed URLs deleted from index */
    private long malformedUrlsDeletedCount = 0;

    private final String solrSortBy;

    /** Set to true when more URLs are still to be processed */
    private boolean moreToRecrawl = true;

    /** True when the job terminated early because an error occurred when requesting the Solr index, or the Solr index was closed */
    private boolean terminatedBySolrFailure = false;

    /** The recrawl job start time */
    private LocalDateTime startTime;

    /** The recrawl job end time */
    private LocalDateTime endTime;

    /**
     * @param xsb
     *            the Switchboard instance holding the server environment
     * @param query
     *            the Solr selection query
     * @param includeFailed
     *            set to true when documents with an HTTP status different from 200
     *            (success) must be included
     * @param deleteOnRecrawl
     *            set to true to delete each selected document from the local index
     *            once it has been queued for recrawl
     */
    public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed, final boolean deleteOnRecrawl) {
        super(3000, 1000); // set lower limits of cycle delay
        this.setName(THREAD_NAME);
        this.setIdleSleep(10*60000); // set actual cycle delays
        this.setBusySleep(2*60000);
        this.setPriority(Thread.MIN_PRIORITY);
        this.setLoadPreReqisite(1);
        this.sb = xsb;
        this.currentQuery = query;
        this.includefailed = includeFailed;
        this.deleteOnRecrawl = deleteOnRecrawl;
        this.urlstack = new HashSet<>();
        // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
        // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
        this.solrSortBy = CollectionSchema.load_date_dt.getSolrFieldName() + " asc";

        final SolrConnector solrConnector = this.sb.index.fulltext().getDefaultConnector();
        if (solrConnector != null && !solrConnector.isClosed()) {
            /* Ensure indexed data is up-to-date before running the main job */
            solrConnector.commit(true);
        }
    }

    /**
     * Sets the query used to select documents to recrawl
     * and resets the chunk offset to start a fresh query loop.
     * @param q select query
     * @param includefailedurls true = docs with any HTTP status are recrawled, false = only httpstatus_i:200 docs are recrawled
     * @param deleteOnRecrawl true to delete documents from the index once they are queued for recrawl
     */
    public void setQuery(String q, boolean includefailedurls, final boolean deleteOnRecrawl) {
        this.currentQuery = q;
        this.includefailed = includefailedurls;
        this.deleteOnRecrawl = deleteOnRecrawl;
        this.chunkstart = 0;
    }

    public String getQuery() {
        return this.currentQuery;
    }

    /**
     * @param queryBase
     *            the base query
     * @param includeFailed
     *            set to true when documents with an HTTP status different from 200
     *            (success) must be included
     * @return the Solr selection query for candidate URLs to recrawl
     */
    public static final String buildSelectionQuery(final String queryBase, final boolean includeFailed) {
        return includeFailed ? queryBase : queryBase + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)";
    }
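
    // For illustration: with DEFAULT_QUERY as the base and includeFailed=false,
    // the effective selection query is
    //   fresh_date_dt:[* TO NOW/DAY-1DAY] AND (httpstatus_i:200)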

    /**
     * Flag to include failed urls (httpstatus_i <> 200):
     * if true -> currentQuery is used as is,
     * if false -> the term " AND (httpstatus_i:200)" is appended to currentQuery
     * @param includefailedurls
     */
    public void setIncludeFailed(boolean includefailedurls) {
        this.includefailed = includefailedurls;
    }

    public boolean getIncludeFailed() {
        return this.includefailed;
    }

    public void setDeleteOnRecrawl(final boolean deleteOnRecrawl) {
        this.deleteOnRecrawl = deleteOnRecrawl;
    }

    public boolean getDeleteOnRecrawl() {
        return this.deleteOnRecrawl;
    }

    /**
     * Feeds the buffered urls to the local crawler.
     * (Switchboard.addToCrawler() is not used here, as it always skips urls that
     * already exist in the index, which would defeat a recrawl.)
     *
     * @return true if urls were added/accepted to the crawler
     */
    private boolean feedToCrawler() {

        int added = 0;

        if (!this.urlstack.isEmpty()) {
            final CrawlProfile profile = this.sb.crawler.defaultRecrawlJobProfile;

            for (final DigestURL url : this.urlstack) {
                final Request request = new Request(ASCII.getBytes(this.sb.peers.mySeed().hash), url, null, "",
                        new Date(), profile.handle(), 0, profile.timezoneOffset());
                String acceptedError = this.sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
                if (!this.includefailed && acceptedError == null) { // skip the initial check if failed docs are to be included
                    acceptedError = this.sb.crawlStacker.checkAcceptanceInitially(url, profile);
                }
                if (acceptedError != null) {
                    this.rejectedUrlsCount++;
                    ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
                    continue;
                }
                final String s;
                s = this.sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, this.sb.robots);

                if (s != null) {
                    this.rejectedUrlsCount++;
                    ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
                } else {
                    added++;
                    this.recrawledUrlsCount++;
                }
            }
            this.urlstack.clear();
        }
        return (added > 0);
    }
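
    // Note (assumption about the AbstractBusyThread contract): the boolean returned
    // by job() steers the cycle delay, selecting the busy sleep when work was done
    // and the idle sleep otherwise.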

    /**
     * Process the query and hand over urls to the crawler
     *
     * @return true if something was processed
     */
    @Override
    public boolean job() {
        // if more than chunksize crawls are already running, do nothing
        if (this.sb.crawlQueues.coreCrawlJobSize() > this.chunksize) {
            return false;
        }

        boolean didSomething = false;
        if (this.urlstack.isEmpty()) {
            if (!this.moreToRecrawl) {
                /* We do not remove the thread from the Switchboard worker threads using serverSwitch.terminateThread(String,boolean),
                 * because we want to be able to provide a report after its termination */
                this.terminate(false);
            } else {
                this.moreToRecrawl = this.processSingleQuery();
                /* Even if no more URLs are to recrawl, the job has done something by searching the Solr index */
                didSomething = true;
            }
        } else {
            didSomething = this.feedToCrawler();
        }
        return didSomething;
    }

    @Override
    public synchronized void start() {
        this.startTime = LocalDateTime.now();
        super.start();
    }

    @Override
    public void terminate(boolean waitFor) {
        super.terminate(waitFor);
        this.endTime = LocalDateTime.now();
    }

    /**
     * Selects the next chunk of documents to recrawl and fills the url stack.
     * @return true if the query has more results
     */
    private boolean processSingleQuery() {
        if (!this.urlstack.isEmpty()) {
            return true;
        }
        SolrDocumentList docList = null;
        final SolrConnector solrConnector = this.sb.index.fulltext().getDefaultConnector();
        if (solrConnector == null || solrConnector.isClosed()) {
            this.urlsToRecrawl = 0;
            this.terminatedBySolrFailure = true;
            return false;
        }

        try {
            // query all docs or only httpstatus=200 docs, depending on the includefailed flag
            docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
                    this.solrSortBy, this.chunkstart, this.chunksize,
                    CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
            this.urlsToRecrawl = docList.getNumFound();
        } catch (final Throwable e) {
            this.urlsToRecrawl = 0;
            this.terminatedBySolrFailure = true;
        }

        if (docList != null) {
            final Set<String> tobedeletedIDs = new HashSet<>();
            for (final SolrDocument doc : docList) {
                try {
                    this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
                    if (this.deleteOnRecrawl) tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
                } catch (final MalformedURLException ex) {
                    this.malformedUrlsCount++;
                    // if the index entry has no valid url it is useless: delete it
                    tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
                    this.malformedUrlsDeletedCount++;
                    ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
                }
            }

            if (!tobedeletedIDs.isEmpty()) try {
                solrConnector.deleteByIds(tobedeletedIDs);
                solrConnector.commit(false);
            } catch (final IOException e) {
                ConcurrentLog.severe(THREAD_NAME, "error deleting IDs ", e);
            }
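
            // When deleteOnRecrawl is set, processed documents are removed from the
            // index, so the remaining results shift down and paging restarts at
            // offset 0; otherwise the offset advances by one chunk.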
            this.chunkstart = this.deleteOnRecrawl ? 0 : this.chunkstart + this.chunksize;
        }

        if (docList == null || docList.size() < this.chunksize) {
            return false;
        }
        return true;
    }

    /**
     * @return a new default CrawlProfile instance to be used for recrawl jobs.
     */
    public static CrawlProfile buildDefaultCrawlProfile() {
        final CrawlProfile profile = new CrawlProfile(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
                CrawlProfile.MATCH_ALL_STRING, // crawlerUrlMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // crawlerUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING, // crawlerIpMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // crawlerIpMustNotMatch
                CrawlProfile.MATCH_NEVER_STRING, // crawlerCountryMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // crawlerNoDepthLimitMatch
                CrawlProfile.MATCH_ALL_STRING, // indexUrlMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
                true, // noindexWhenCanonicalUnequalURL
                0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
                true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
                "robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
                ClientIdentification.yacyInternetCrawlerAgentName,
                TagValency.EVAL, null, null, 0);
        return profile;
    }

    @Override
    public int getJobCount() {
        return this.urlstack.size();
    }

    /**
     * @return The total number of candidate URLs found for recrawl
     */
    public long getUrlsToRecrawl() {
        return this.urlsToRecrawl;
    }

    /**
     * @return The total number of URLs added to the crawler queue for recrawl
     */
    public long getRecrawledUrlsCount() {
        return this.recrawledUrlsCount;
    }

    /**
     * @return The total number of URLs rejected for some reason by the crawl
     *         stacker or the crawler queue
     */
    public long getRejectedUrlsCount() {
        return this.rejectedUrlsCount;
    }

    /**
     * @return The total number of malformed URLs found
     */
    public long getMalformedUrlsCount() {
        return this.malformedUrlsCount;
    }

    /**
     * @return The total number of malformed URLs deleted from index
     */
    public long getMalformedUrlsDeletedCount() {
        return this.malformedUrlsDeletedCount;
    }

    /**
     * @return true when the job terminated early because an error occurred when
     *         requesting the Solr index, or the Solr index was closed
     */
    public boolean isTerminatedBySolrFailure() {
        return this.terminatedBySolrFailure;
    }

    /** @return The recrawl job start time */
    public LocalDateTime getStartTime() {
        return this.startTime;
    }

    /** @return The recrawl job end time */
    public LocalDateTime getEndTime() {
        return this.endTime;
    }

    @Override
    public void freemem() {
        this.urlstack.clear();
    }

}