You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
320 lines
13 KiB
320 lines
13 KiB
/**
|
|
* AutoSearch.java
|
|
* Copyright 2015 by Burkhard Buelte
|
|
* First released 09.01.2015 at http://yacy.net
|
|
*
|
|
* This is a part of YaCy, a peer-to-peer based web search engine
|
|
*
|
|
* LICENSE
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program in the file lgpl21.txt
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package net.yacy.search;
|
|
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.FileNotFoundException;
|
|
import java.io.FileOutputStream;
|
|
import java.io.IOException;
|
|
import java.io.OutputStream;
|
|
import java.util.ArrayList;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.Properties;
|
|
import java.util.Set;
|
|
import net.yacy.cora.document.feed.RSSFeed;
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.cora.document.id.MultiProtocolURL;
|
|
import static net.yacy.cora.federate.opensearch.SRURSSConnector.loadSRURSS;
|
|
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
|
|
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
|
import net.yacy.cora.federate.solr.instance.RemoteInstance;
|
|
import net.yacy.cora.federate.yacy.CacheStrategy;
|
|
import net.yacy.cora.protocol.ClientIdentification;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.data.BookmarksDB.Bookmark;
|
|
import net.yacy.kelondro.workflow.AbstractBusyThread;
|
|
import net.yacy.peers.Seed;
|
|
import net.yacy.search.schema.CollectionSchema;
|
|
import org.apache.solr.client.solrj.SolrQuery;
|
|
import org.apache.solr.client.solrj.response.QueryResponse;
|
|
import org.apache.solr.common.SolrDocument;
|
|
import org.apache.solr.common.SolrDocumentList;
|
|
import org.apache.solr.common.params.CommonParams;
|
|
|
|
/**
|
|
* AutoSearch retrieves queries from Bookmarks or a property file (if existing)
|
|
* and loops to a list of connected peers and asks each for results which are
|
|
* added to the local index.
|
|
*/
|
|
public class AutoSearch extends AbstractBusyThread {
|
|
|
|
private Set<String> querystack; // serach query
|
|
public String currentQuery = null; // current query
|
|
private Set<String> currentTargets = null; // peer hashes
|
|
final Switchboard sb;
|
|
public int gotresults;
|
|
private long lastInitTime; // to recognize new data (Bookmarks) to import
|
|
|
|
public AutoSearch(Switchboard xsb) {
|
|
super(3000, 1000); // set lower limits of cycle delay
|
|
this.setIdleSleep(60000); // set actual cycle delays
|
|
this.setBusySleep(10000);
|
|
this.sb = xsb;
|
|
|
|
gotresults = 0;
|
|
querystack = new HashSet<String>();
|
|
|
|
this.lastInitTime = System.currentTimeMillis() - 600000; // init to now - 10 min
|
|
if (!checkBookmarkDB()) {
|
|
try {
|
|
// check for old queries in temp property file
|
|
File pfile = new File(xsb.dataPath, "DATA/SETTINGS/autosearch.conf");
|
|
if (pfile.exists()) {
|
|
ConcurrentLog.info(AutoSearch.class.getName(), "read queries from file " + pfile.getAbsolutePath());
|
|
Properties prop = new Properties();
|
|
FileInputStream fileIn = new FileInputStream(pfile);
|
|
prop.load(fileIn);
|
|
if (prop.size() > 0) {
|
|
Set<Object> all = prop.keySet();
|
|
for (Object s : all) {
|
|
String query = prop.getProperty((String) s);
|
|
if (query != null && !query.isEmpty()) {
|
|
querystack.add(query);
|
|
}
|
|
}
|
|
}
|
|
fileIn.close();
|
|
}
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.warn(AutoSearch.class.getName(), "Error reading config file");
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Save current queries to a (temporary) property file to allow continue
|
|
* after a restart. Existing file will be overwritten or deleted.
|
|
*/
|
|
private void saveasPropFile() {
|
|
File pfile = new File(sb.dataPath, "DATA/SETTINGS/autosearch.conf");
|
|
if (querystack.size() == 0) {
|
|
if (pfile.exists()) {
|
|
pfile.delete();
|
|
}
|
|
} else {
|
|
try {
|
|
Properties prop = new Properties();
|
|
for (String s : querystack) {
|
|
prop.put("query" + s.hashCode(), s);
|
|
}
|
|
OutputStream fileOut = new FileOutputStream(pfile);
|
|
prop.store(fileOut, "AutoSearch query list");
|
|
fileOut.close();
|
|
} catch (FileNotFoundException ex) {
|
|
ConcurrentLog.warn(AutoSearch.class.getName(), "can not create file " + pfile.getAbsolutePath());
|
|
} catch (IOException ex) {
|
|
ConcurrentLog.warn(AutoSearch.class.getName(), "IO error writing to file " + pfile.getAbsolutePath());
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get peers to query (peers connected)
|
|
*
|
|
* @return Set of peer hashes to contact
|
|
*/
|
|
private void initPeerList() {
|
|
if (currentTargets == null) {
|
|
currentTargets = new HashSet<String>();
|
|
}
|
|
// TODO: DHT peers could be excluded
|
|
Iterator<Seed> it = Switchboard.getSwitchboard().peers.seedsConnected(true, false, null, 0);
|
|
while (it.hasNext()) {
|
|
Seed s = it.next();
|
|
currentTargets.add(s.hash);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Check BookmarkDB for existing queries return true if new entry added to
|
|
* query queue. Store queries in (temporary) property file
|
|
*
|
|
* @return true if new query from bookmark was added
|
|
*/
|
|
private boolean checkBookmarkDB() {
|
|
int added = 0;
|
|
Iterator<Bookmark> it = Switchboard.getSwitchboard().bookmarksDB.getBookmarksIterator();
|
|
if (it != null) {
|
|
while (it.hasNext()) {
|
|
Bookmark bmk = it.next();
|
|
// get search bookmarks only
|
|
if (bmk.getFoldersString().startsWith("/search")) {
|
|
// take only new created or edited bookmarks
|
|
if (bmk.getTimeStamp() >= this.lastInitTime) {
|
|
final String query = bmk.getDescription();
|
|
if (!query.isEmpty() && query.startsWith("query=")) {
|
|
{
|
|
querystack.add(query.substring(6));
|
|
added++;
|
|
ConcurrentLog.info(AutoSearch.class.getName(), "add query from Bookmarks " + query);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (added > 0) {
|
|
this.lastInitTime = System.currentTimeMillis();
|
|
saveasPropFile();
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Process query queue, select one query and peer to ask next
|
|
*
|
|
* @return true if something processed
|
|
*/
|
|
@Override
|
|
public boolean job() {
|
|
|
|
if (currentQuery == null && querystack != null && querystack.size() > 0) {
|
|
currentQuery = querystack.iterator().next();
|
|
querystack.remove(currentQuery); // imediate remove to asure no repeat
|
|
initPeerList(); // late initialization of peerlist to get currently connected
|
|
}
|
|
|
|
// ask next peer for search term
|
|
if (currentQuery != null && !currentQuery.isEmpty()) {
|
|
if (currentTargets != null && !currentTargets.isEmpty()) {
|
|
while (currentTargets.size() > 0) { // loop only to skip disconnected peers
|
|
String peerhash = currentTargets.iterator().next();
|
|
currentTargets.remove(peerhash);
|
|
Seed seed = Switchboard.getSwitchboard().peers.getConnected(peerhash);
|
|
if (seed != null) {
|
|
processSingleTarget(seed);
|
|
return true; // just one query per busycycle is intended
|
|
}
|
|
}
|
|
}
|
|
currentQuery = null;
|
|
}
|
|
|
|
// no search targets
|
|
checkBookmarkDB();
|
|
|
|
// TODO: do idle processing
|
|
// analyse content of local index
|
|
// extend search with learned new search terms
|
|
// follow most promising links
|
|
ConcurrentLog.fine(AutoSearch.class.getName(), "nothing to do");
|
|
return this.querystack.size() > 0;
|
|
}
|
|
|
|
/**
|
|
* Calls one peer for search results of the current query and adds it to the
|
|
* local index. Depending on peers SolrAvailable flag the a solr query or
|
|
* opensearch/rss query is used.
|
|
*
|
|
* @param seed the peer to ask
|
|
*/
|
|
private void processSingleTarget(Seed seed) {
|
|
ConcurrentLog.fine(AutoSearch.class.getName(), "ask " + seed.getIP() + " " + seed.getName() + " for query=" + currentQuery);
|
|
|
|
if (seed.getFlagSolrAvailable()) { // do a solr query
|
|
SolrDocumentList docList = null;
|
|
SolrQuery solrQuery = new SolrQuery();
|
|
// use remote defaults and ranking (to query their index right)
|
|
solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special
|
|
solrQuery.set("q.op", "AND"); // except ... no one word matches please
|
|
solrQuery.set(CommonParams.ROWS, "20");
|
|
this.setName("Protocol.solrQuery(" + solrQuery.getQuery() + " to " + seed.hash + ")");
|
|
try {
|
|
RemoteInstance instance = new RemoteInstance("http://" + seed.getPublicAddress(seed.getIP()) + "/solr/", null, null, 10000); // this is a 'patch configuration' which considers 'solr' as default collection
|
|
try {
|
|
SolrConnector solrConnector = new RemoteSolrConnector(instance, true, null);
|
|
if (!solrConnector.isClosed()) {
|
|
try {
|
|
QueryResponse rsp = solrConnector.getResponseByParams(solrQuery);
|
|
docList = rsp.getResults();
|
|
} catch (Throwable e) {
|
|
} finally {
|
|
solrConnector.close();
|
|
}
|
|
}
|
|
} catch (Throwable ee) {
|
|
} finally {
|
|
instance.close();
|
|
}
|
|
if (docList != null) {
|
|
for (SolrDocument d : docList) {
|
|
sb.index.fulltext().putDocument(sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(d));
|
|
this.gotresults++;
|
|
}
|
|
ConcurrentLog.info(AutoSearch.class.getName(), "added " + docList.size() + " results from " + seed.getName() + " to index for solrquery=" + currentQuery);
|
|
}
|
|
} catch (Throwable eee) {
|
|
}
|
|
} else { // do a yacysearch.rss query
|
|
final String rssSearchServiceURL = "http://" + seed.getPublicAddress(seed.getIP()) + "/yacysearch.rss";
|
|
try {
|
|
RSSFeed feed = loadSRURSS(
|
|
rssSearchServiceURL,
|
|
currentQuery,
|
|
0,
|
|
20,
|
|
CacheStrategy.IFFRESH,
|
|
false, // just local, as we ask others too
|
|
ClientIdentification.yacyInternetCrawlerAgent);
|
|
final List<DigestURL> urls = new ArrayList<DigestURL>();
|
|
for (final MultiProtocolURL entry : feed.getLinks()) {
|
|
urls.add(new DigestURL(entry, (byte[]) null));
|
|
this.gotresults++;
|
|
}
|
|
sb.addToIndex(urls, null, "AutoSearch", null, true);
|
|
ConcurrentLog.info(AutoSearch.class.getName(), "added " + urls.size() + " results from " + seed.getName() + " to index for query=" + currentQuery);
|
|
} catch (IOException ex) {
|
|
ConcurrentLog.info(AutoSearch.class.getName(), "no answer from " + seed.getName());
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Estimate of queries to perform
|
|
*/
|
|
@Override
|
|
public int getJobCount() {
|
|
if (currentTargets != null) {
|
|
int cnt = currentTargets.size();
|
|
cnt += querystack.size() * sb.peers.sizeConnected();
|
|
return cnt;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
@Override
|
|
public void freemem() {
|
|
}
|
|
|
|
@Override
|
|
public void close() {
|
|
this.saveasPropFile(); // saves or deletes property file with queries
|
|
}
|
|
}
|