commit
8cafdb989a
@ -0,0 +1,321 @@
|
||||
/**
|
||||
* AutoSearch.java
|
||||
* Copyright 2015 by Burkhard Buelte
|
||||
* First released 09.01.2015 at http://yacy.net
|
||||
*
|
||||
* This is a part of YaCy, a peer-to-peer based web search engine
|
||||
*
|
||||
* LICENSE
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.search;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
import net.yacy.cora.document.feed.RSSFeed;
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
import static net.yacy.cora.federate.opensearch.SRURSSConnector.loadSRURSS;
|
||||
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
|
||||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||
import net.yacy.cora.federate.solr.instance.RemoteInstance;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.data.BookmarksDB.Bookmark;
|
||||
import net.yacy.kelondro.workflow.AbstractBusyThread;
|
||||
import net.yacy.peers.Seed;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
|
||||
/**
|
||||
* AutoSearch retrieves queries from Bookmarks or a property file (if existing)
|
||||
* and loops to a list of connected peers and asks each for results which are
|
||||
* added to the local index.
|
||||
*/
|
||||
public class AutoSearch extends AbstractBusyThread {
|
||||
|
||||
private Set<String> querystack; // serach query
|
||||
public String currentQuery = null; // current query
|
||||
private Set<String> currentTargets = null; // peer hashes
|
||||
final Switchboard sb;
|
||||
public int gotresults;
|
||||
private long lastInitTime; // to recognize new data (Bookmarks) to import
|
||||
|
||||
public AutoSearch(Switchboard xsb) {
|
||||
super(3000, 1000); // set lower limits of cycle delay
|
||||
this.setIdleSleep(60000); // set actual cycle delays
|
||||
this.setBusySleep(10000);
|
||||
this.sb = xsb;
|
||||
|
||||
gotresults = 0;
|
||||
querystack = new HashSet<String>();
|
||||
|
||||
this.lastInitTime = System.currentTimeMillis() - 600000; // init to now - 10 min
|
||||
if (!checkBookmarkDB()) {
|
||||
try {
|
||||
// check for old queries in temp property file
|
||||
File pfile = new File(xsb.dataPath, "DATA/SETTINGS/autosearch.conf");
|
||||
if (pfile.exists()) {
|
||||
ConcurrentLog.info(AutoSearch.class.getName(), "read queries from file " + pfile.getAbsolutePath());
|
||||
Properties prop = new Properties();
|
||||
FileInputStream fileIn = new FileInputStream(pfile);
|
||||
prop.load(fileIn);
|
||||
if (prop.size() > 0) {
|
||||
Set<Object> all = prop.keySet();
|
||||
for (Object s : all) {
|
||||
String query = prop.getProperty((String) s);
|
||||
if (query != null && !query.isEmpty()) {
|
||||
querystack.add(query);
|
||||
}
|
||||
}
|
||||
}
|
||||
fileIn.close();
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
ConcurrentLog.warn(AutoSearch.class.getName(), "Error reading config file");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Save current queries to a (temporary) property file to allow continue
|
||||
* after a restart. Existing file will be overwritten or deleted.
|
||||
*/
|
||||
private void saveasPropFile() {
|
||||
File pfile = new File(sb.dataPath, "DATA/SETTINGS/autosearch.conf");
|
||||
if (querystack.size() == 0) {
|
||||
if (pfile.exists()) {
|
||||
pfile.delete();
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
Properties prop = new Properties();
|
||||
for (String s : querystack) {
|
||||
prop.put("query" + s.hashCode(), s);
|
||||
}
|
||||
OutputStream fileOut = new FileOutputStream(pfile);
|
||||
prop.store(fileOut, "AutoSearch query list");
|
||||
fileOut.close();
|
||||
} catch (FileNotFoundException ex) {
|
||||
ConcurrentLog.warn(AutoSearch.class.getName(), "can not create file " + pfile.getAbsolutePath());
|
||||
} catch (IOException ex) {
|
||||
ConcurrentLog.warn(AutoSearch.class.getName(), "IO error writing to file " + pfile.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get peers to query (peers connected)
|
||||
*
|
||||
* @return Set of peer hashes to contact
|
||||
*/
|
||||
private void initPeerList() {
|
||||
if (currentTargets == null) {
|
||||
currentTargets = new HashSet<String>();
|
||||
}
|
||||
// TODO: DHT peers could be excluded
|
||||
Iterator<Seed> it = Switchboard.getSwitchboard().peers.seedsConnected(true, false, null, 0);
|
||||
while (it.hasNext()) {
|
||||
Seed s = it.next();
|
||||
currentTargets.add(s.hash);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check BookmarkDB for existing queries return true if new entry added to
|
||||
* query queue. Store queries in (temporary) property file
|
||||
*
|
||||
* @return true if new query from bookmark was added
|
||||
*/
|
||||
private boolean checkBookmarkDB() {
|
||||
int added = 0;
|
||||
Iterator<Bookmark> it = Switchboard.getSwitchboard().bookmarksDB.getBookmarksIterator();
|
||||
if (it != null) {
|
||||
while (it.hasNext()) {
|
||||
Bookmark bmk = it.next();
|
||||
// get search bookmarks only
|
||||
if (bmk.getFoldersString().startsWith("/search")) {
|
||||
// take only new created or edited bookmarks
|
||||
if (bmk.getTimeStamp() >= this.lastInitTime) {
|
||||
final String query = bmk.getDescription();
|
||||
if (!query.isEmpty() && query.startsWith("query=")) {
|
||||
{
|
||||
querystack.add(query.substring(6));
|
||||
added++;
|
||||
ConcurrentLog.info(AutoSearch.class.getName(), "add query from Bookmarks " + query);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (added > 0) {
|
||||
this.lastInitTime = System.currentTimeMillis();
|
||||
saveasPropFile();
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process query queue, select one query and peer to ask next
|
||||
*
|
||||
* @return true if something processed
|
||||
*/
|
||||
@Override
|
||||
public boolean job() {
|
||||
|
||||
if (currentQuery == null && querystack != null && querystack.size() > 0) {
|
||||
currentQuery = querystack.iterator().next();
|
||||
querystack.remove(currentQuery); // imediate remove to asure no repeat
|
||||
initPeerList(); // late initialization of peerlist to get currently connected
|
||||
}
|
||||
|
||||
// ask next peer for search term
|
||||
if (currentQuery != null && !currentQuery.isEmpty()) {
|
||||
if (currentTargets != null && !currentTargets.isEmpty()) {
|
||||
while (currentTargets.size() > 0) { // loop only to skip disconnected peers
|
||||
String peerhash = currentTargets.iterator().next();
|
||||
currentTargets.remove(peerhash);
|
||||
Seed seed = Switchboard.getSwitchboard().peers.getConnected(peerhash);
|
||||
if (seed != null) {
|
||||
processSingleTarget(seed);
|
||||
return true; // just one query per busycycle is intended
|
||||
}
|
||||
}
|
||||
}
|
||||
currentQuery = null;
|
||||
}
|
||||
|
||||
// no search targets
|
||||
checkBookmarkDB();
|
||||
|
||||
// TODO: do idle processing
|
||||
// analyse content of local index
|
||||
// extend search with learned new search terms
|
||||
// follow most promising links
|
||||
ConcurrentLog.fine(AutoSearch.class.getName(), "nothing to do");
|
||||
return this.querystack.size() > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls one peer for search results of the current query and adds it to the
|
||||
* local index. Depending on peers SolrAvailable flag the a solr query or
|
||||
* opensearch/rss query is used.
|
||||
*
|
||||
* @param seed the peer to ask
|
||||
*/
|
||||
private void processSingleTarget(Seed seed) {
|
||||
ConcurrentLog.fine(AutoSearch.class.getName(), "ask " + seed.getIP() + " " + seed.getName() + " for query=" + currentQuery);
|
||||
|
||||
if (seed.getFlagSolrAvailable()) { // do a solr query
|
||||
SolrDocumentList docList = null;
|
||||
SolrQuery solrQuery = new SolrQuery();
|
||||
// use remote defaults and ranking (to query their index right)
|
||||
solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special
|
||||
solrQuery.set("q.op", "AND"); // except ... no one word matches please
|
||||
solrQuery.set(CommonParams.ROWS, "20");
|
||||
this.setName("Protocol.solrQuery(" + solrQuery.getQuery() + " to " + seed.hash + ")");
|
||||
try {
|
||||
RemoteInstance instance = new RemoteInstance("http://" + seed.getPublicAddress(seed.getIP()) + "/solr/", null, null, 10000); // this is a 'patch configuration' which considers 'solr' as default collection
|
||||
try {
|
||||
SolrConnector solrConnector = new RemoteSolrConnector(instance, true, null);
|
||||
if (!solrConnector.isClosed()) {
|
||||
try {
|
||||
QueryResponse rsp = solrConnector.getResponseByParams(solrQuery);
|
||||
docList = rsp.getResults();
|
||||
} catch (Throwable e) {
|
||||
} finally {
|
||||
solrConnector.close();
|
||||
}
|
||||
}
|
||||
} catch (Throwable ee) {
|
||||
} finally {
|
||||
instance.close();
|
||||
}
|
||||
if (docList != null) {
|
||||
for (SolrDocument d : docList) {
|
||||
sb.index.fulltext().putDocument(sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(d));
|
||||
this.gotresults++;
|
||||
}
|
||||
ConcurrentLog.info(AutoSearch.class.getName(), "added " + docList.size() + " results from " + seed.getName() + " to index for solrquery=" + currentQuery);
|
||||
}
|
||||
} catch (Throwable eee) {
|
||||
}
|
||||
} else { // do a yacysearch.rss query
|
||||
final String rssSearchServiceURL = "http://" + seed.getPublicAddress(seed.getIP()) + "/yacysearch.rss";
|
||||
try {
|
||||
RSSFeed feed = loadSRURSS(
|
||||
rssSearchServiceURL,
|
||||
currentQuery,
|
||||
0,
|
||||
20,
|
||||
CacheStrategy.IFFRESH,
|
||||
false, // just local, as we ask others too
|
||||
ClientIdentification.yacyInternetCrawlerAgent);
|
||||
final List<DigestURL> urls = new ArrayList<DigestURL>();
|
||||
for (final MultiProtocolURL entry : feed.getLinks()) {
|
||||
urls.add(new DigestURL(entry, (byte[]) null));
|
||||
this.gotresults++;
|
||||
}
|
||||
sb.addToIndex(urls, null, "AutoSearch", null, true);
|
||||
ConcurrentLog.info(AutoSearch.class.getName(), "added " + urls.size() + " results from " + seed.getName() + " to index for query=" + currentQuery);
|
||||
} catch (IOException ex) {
|
||||
ConcurrentLog.info(AutoSearch.class.getName(), "no answer from " + seed.getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate of queries to perform
|
||||
*/
|
||||
@Override
|
||||
public int getJobCount() {
|
||||
if (currentTargets != null) {
|
||||
int cnt = currentTargets.size();
|
||||
cnt += querystack.size() * sb.peers.sizeConnected();
|
||||
return cnt;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void freemem() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
this.saveasPropFile(); // saves or deletes property file with queries
|
||||
}
|
||||
}
|
Loading…
Reference in new issue