/**
 * Balancer
 * SPDX-FileCopyrightText: 2014 Michael Peter Christen <mc@yacy.net>
 * SPDX-License-Identifier: GPL-2.0-or-later
 * Frankfurt am Main, Germany
 * First released 14.04.2014 at https://yacy.net
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.crawler;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;

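/**
 * A Balancer provides the crawl queue of the crawler: crawl requests are pushed onto
 * the stack and later popped in an order that maximizes the time between two accesses
 * to the same host, so that per-host minimum delay times are respected. URLs are kept
 * in per-domain stacks, which can be inspected with getDomainStackHosts() and
 * getDomainStackReferences().
 */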
public interface Balancer {

    /**
     * Close the balancer object.
     */
    public void close();

    /**
     * Delete all URLs from the stack.
     */
    public void clear();

    /**
     * Get one URL from the crawl stack by its URL hash.
     * @param urlhash the hash of the URL to look up
     * @return the request for the URL identified by the given hash
     * @throws IOException
     */
    public Request get(final byte[] urlhash) throws IOException;

    /**
     * Delete all URLs from the stack that belong to the given profile handle.
     * @param profileHandle the handle of the crawl profile whose URLs shall be removed
     * @param timeout the maximum time to spend on the removal
     * @return the number of removed URLs
     * @throws IOException
     * @throws SpaceExceededException
     */
    public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException;

    /**
     * Delete all URLs that are stored for the given host hashes.
     * @param hosthashes a set of host hashes
     * @return the number of deleted URLs
     */
    public int removeAllByHostHashes(final Set<String> hosthashes);

    /**
     * Remove the given URL hashes from the stack.
     * @param urlHashes a set of hashes that shall be removed
     * @return the number of entries that have been removed
     * @throws IOException
     */
    public int remove(final HandleSet urlHashes) throws IOException;

    /**
     * Check if the given URL hash is contained in the balancer stack.
     * @param urlhashb the URL hash to check
     * @return true if the URL is queued here, false otherwise
     */
    public boolean has(final byte[] urlhashb);

    /**
     * Get the size of the stack.
     * @return the number of URLs waiting to be loaded
     */
    public int size();

    /**
     * @return the on-demand limit, a tuning parameter passed through to the balancer
     *         implementation (see the implementing class for its exact semantics)
     */
    public int getOnDemandLimit();

    /**
     * @return the exceed134217727 flag (134217727 is 2^27 - 1), a storage tuning parameter
     *         passed through to the balancer implementation
     */
    public boolean getExceed134217727();

    /**
     * Check if the stack is empty.
     * @return true if and only if size() == 0
     */
    public boolean isEmpty();

    /**
     * Push a crawl request onto the balancer stack.
     * @param entry the crawl request to be queued
     * @param profile the crawl profile the request belongs to
     * @param robots the RobotsTxt handler
     * @return null if the push was successful, or a String explaining what went wrong in case of an error
     * @throws IOException
     * @throws SpaceExceededException
     */
    public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException;
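
    /*
     * Illustrative call pattern (not part of the interface): push() signals success by
     * returning null, so callers treat any non-null result as the reason why the request
     * was not queued. The request, profile and robots names are placeholders for objects
     * provided by the crawler environment.
     *
     *   final String rejectReason = balancer.push(request, profile, robots);
     *   if (rejectReason != null) {
     *       // the request was rejected; rejectReason explains why
     *   }
     */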

    /**
     * Get a list of domains that are currently maintained as domain stacks.
     * @param robots the RobotsTxt handler
     * @return a map from clear-text host names (each possibly concatenated with a port,
     *         depending on the implementation) to an Integer array:
     *         {the size of the domain stack, the guessed delta waiting time}
     */
    public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots);
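
    /*
     * Illustrative sketch of how the returned map can be read; the index positions follow
     * the {stack size, guessed delta waiting time} convention documented above, and the
     * balancer and robots names are placeholders:
     *
     *   for (final Map.Entry<String, Integer[]> entry : balancer.getDomainStackHosts(robots).entrySet()) {
     *       final String host = entry.getKey();            // host name, possibly host:port
     *       final int stackSize = entry.getValue()[0];     // queued requests for this host
     *       final int waitingDelta = entry.getValue()[1];  // guessed delta waiting time
     *   }
     */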

    /**
     * Get a list of crawl request entries for a specific host.
     * @param host the host name
     * @param maxcount the maximum number of entries to return
     * @param maxtime the maximum time to spend collecting entries
     * @return a list of crawl loader requests
     */
    public List<Request> getDomainStackReferences(final String host, int maxcount, final long maxtime);

    /**
     * Get the next entry from this crawl queue in such a way that the domain access time delta
     * is maximized and always above the given minimum delay time. In case the minimum time cannot
     * be ensured, this method pauses for the necessary time until the URL is released and returned
     * as a CrawlEntry object. If no profile exists for the computed entry, null is returned.
     * @param delay true if the requester demands forced delays using an explicit thread sleep
     * @param cs the crawl switchboard, used to look up the crawl profile of the entry
     * @param robots the RobotsTxt handler
     * @return a URL in a CrawlEntry object, or null if no profile exists for the entry
     * @throws IOException
     */
    public Request pop(final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException;
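
    /*
     * Illustrative call pattern (switchboard and robots are placeholders for the
     * CrawlSwitchboard and RobotsTxt instances of the running crawler). pop() may block
     * to enforce the minimum access delay and may return null when no profile exists
     * for the computed entry:
     *
     *   final Request next = balancer.pop(true, switchboard, robots);
     *   if (next != null) {
     *       // hand the request over to a loader
     *   }
     */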

    /**
     * Iterate through all requests in the queue.
     * @return an iterator over all requests in the queue
     * @throws IOException
     */
    public Iterator<Request> iterator() throws IOException;

}