You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/net/yacy/search/AutoSearch.java

320 lines
13 KiB

/**
* AutoSearch.java
* Copyright 2015 by Burkhard Buelte
* First released 09.01.2015 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import static net.yacy.cora.federate.opensearch.SRURSSConnector.loadSRURSS;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.peers.Seed;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
/**
* AutoSearch retrieves queries from Bookmarks or, if it exists, from a property
* file. It then loops over the connected peers and asks each one for results,
* which are added to the local index.
*/
public class AutoSearch extends AbstractBusyThread {
private Set<String> querystack; // serach query
public String currentQuery = null; // current query
private Set<String> currentTargets = null; // peer hashes
final Switchboard sb;
public int gotresults;
private long lastInitTime; // to recognize new data (Bookmarks) to import
public AutoSearch(Switchboard xsb) {
super(3000, 1000); // set lower limits of cycle delay
this.setIdleSleep(60000); // set actual cycle delays
this.setBusySleep(10000);
this.sb = xsb;
gotresults = 0;
querystack = new HashSet<String>();
this.lastInitTime = System.currentTimeMillis() - 600000; // init to now - 10 min
if (!checkBookmarkDB()) {
try {
// check for old queries in temp property file
File pfile = new File(xsb.dataPath, "DATA/SETTINGS/autosearch.conf");
if (pfile.exists()) {
ConcurrentLog.info(AutoSearch.class.getName(), "read queries from file " + pfile.getAbsolutePath());
Properties prop = new Properties();
FileInputStream fileIn = new FileInputStream(pfile);
prop.load(fileIn);
if (prop.size() > 0) {
Set<Object> all = prop.keySet();
for (Object s : all) {
String query = prop.getProperty((String) s);
if (query != null && !query.isEmpty()) {
querystack.add(query);
}
}
}
fileIn.close();
}
} catch (final IOException e) {
ConcurrentLog.warn(AutoSearch.class.getName(), "Error reading config file");
}
}
}
/**
* Save current queries to a (temporary) property file to allow continue
* after a restart. Existing file will be overwritten or deleted.
*/
private void saveasPropFile() {
File pfile = new File(sb.dataPath, "DATA/SETTINGS/autosearch.conf");
if (querystack.size() == 0) {
if (pfile.exists()) {
pfile.delete();
}
} else {
try {
Properties prop = new Properties();
for (String s : querystack) {
prop.put("query" + s.hashCode(), s);
}
OutputStream fileOut = new FileOutputStream(pfile);
prop.store(fileOut, "AutoSearch query list");
fileOut.close();
} catch (FileNotFoundException ex) {
ConcurrentLog.warn(AutoSearch.class.getName(), "can not create file " + pfile.getAbsolutePath());
} catch (IOException ex) {
ConcurrentLog.warn(AutoSearch.class.getName(), "IO error writing to file " + pfile.getAbsolutePath());
}
}
}
/**
* Get peers to query (peers connected)
*
* @return Set of peer hashes to contact
*/
private void initPeerList() {
if (currentTargets == null) {
currentTargets = new HashSet<String>();
}
// TODO: DHT peers could be excluded
Iterator<Seed> it = Switchboard.getSwitchboard().peers.seedsConnected(true, false, null, 0);
while (it.hasNext()) {
Seed s = it.next();
currentTargets.add(s.hash);
}
}
/**
* Check BookmarkDB for existing queries return true if new entry added to
* query queue. Store queries in (temporary) property file
*
* @return true if new query from bookmark was added
*/
private boolean checkBookmarkDB() {
int added = 0;
Iterator<Bookmark> it = Switchboard.getSwitchboard().bookmarksDB.getBookmarksIterator();
if (it != null) {
while (it.hasNext()) {
Bookmark bmk = it.next();
// get search bookmarks only
if (bmk.getFoldersString().startsWith("/search")) {
// take only new created or edited bookmarks
if (bmk.getTimeStamp() >= this.lastInitTime) {
final String query = bmk.getQuery();
if (query != null && !query.isEmpty()) {
{
querystack.add(query);
added++;
ConcurrentLog.info(AutoSearch.class.getName(), "add query from Bookmarks: query=" + query);
}
}
}
}
}
}
if (added > 0) {
this.lastInitTime = System.currentTimeMillis();
saveasPropFile();
return true;
}
return false;
}
/**
* Process query queue, select one query and peer to ask next
*
* @return true if something processed
*/
@Override
public boolean job() {
if (currentQuery == null && querystack != null && querystack.size() > 0) {
currentQuery = querystack.iterator().next();
querystack.remove(currentQuery); // imediate remove to asure no repeat
initPeerList(); // late initialization of peerlist to get currently connected
}
// ask next peer for search term
if (currentQuery != null && !currentQuery.isEmpty()) {
if (currentTargets != null && !currentTargets.isEmpty()) {
while (currentTargets.size() > 0) { // loop only to skip disconnected peers
String peerhash = currentTargets.iterator().next();
currentTargets.remove(peerhash);
Seed seed = Switchboard.getSwitchboard().peers.getConnected(peerhash);
if (seed != null) {
processSingleTarget(seed);
return true; // just one query per busycycle is intended
}
}
}
currentQuery = null;
}
// no search targets
checkBookmarkDB();
// TODO: do idle processing
// analyse content of local index
// extend search with learned new search terms
// follow most promising links
ConcurrentLog.fine(AutoSearch.class.getName(), "nothing to do");
return this.querystack.size() > 0;
}
/**
* Calls one peer for search results of the current query and adds it to the
* local index. Depending on peers SolrAvailable flag the a solr query or
* opensearch/rss query is used.
*
* @param seed the peer to ask
*/
private void processSingleTarget(Seed seed) {
ConcurrentLog.fine(AutoSearch.class.getName(), "ask " + seed.getIP() + " " + seed.getName() + " for query=" + currentQuery);
if (seed.getFlagSolrAvailable()) { // do a solr query
SolrDocumentList docList = null;
SolrQuery solrQuery = new SolrQuery();
// use remote defaults and ranking (to query their index right)
solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special
solrQuery.set("q.op", "AND"); // except ... no one word matches please
solrQuery.set(CommonParams.ROWS, "20");
this.setName("Protocol.solrQuery(" + solrQuery.getQuery() + " to " + seed.hash + ")");
try {
RemoteInstance instance = new RemoteInstance("http://" + seed.getPublicAddress(seed.getIP()) + "/solr/", null, null, 10000); // this is a 'patch configuration' which considers 'solr' as default collection
try {
SolrConnector solrConnector = new RemoteSolrConnector(instance, true, null);
if (!solrConnector.isClosed()) {
try {
QueryResponse rsp = solrConnector.getResponseByParams(solrQuery);
docList = rsp.getResults();
} catch (Throwable e) {
} finally {
solrConnector.close();
}
}
} catch (Throwable ee) {
} finally {
instance.close();
}
if (docList != null) {
for (SolrDocument d : docList) {
sb.index.fulltext().putDocument(sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(d));
this.gotresults++;
}
ConcurrentLog.info(AutoSearch.class.getName(), "added " + docList.size() + " results from " + seed.getName() + " to index for solrquery=" + currentQuery);
}
} catch (Throwable eee) {
}
} else { // do a yacysearch.rss query
final String rssSearchServiceURL = "http://" + seed.getPublicAddress(seed.getIP()) + "/yacysearch.rss";
try {
RSSFeed feed = loadSRURSS(
rssSearchServiceURL,
currentQuery,
0,
20,
CacheStrategy.IFFRESH,
false, // just local, as we ask others too
ClientIdentification.yacyInternetCrawlerAgent);
final List<DigestURL> urls = new ArrayList<DigestURL>();
for (final MultiProtocolURL entry : feed.getLinks()) {
urls.add(new DigestURL(entry, (byte[]) null));
this.gotresults++;
}
sb.addToIndex(urls, null, "AutoSearch", null, true);
ConcurrentLog.info(AutoSearch.class.getName(), "added " + urls.size() + " results from " + seed.getName() + " to index for query=" + currentQuery);
} catch (IOException ex) {
ConcurrentLog.info(AutoSearch.class.getName(), "no answer from " + seed.getName());
}
}
}
/**
* Estimate of queries to perform
*/
@Override
public int getJobCount() {
if (currentTargets != null) {
int cnt = currentTargets.size();
cnt += querystack.size() * sb.peers.sizeConnected();
return cnt;
}
return 0;
}
@Override
public void freemem() {
}
@Override
public void close() {
this.saveasPropFile(); // saves or deletes property file with queries
}
}