From 4214f250d0e34ca914f45eb46c8a464d55ee2e24 Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 9 Jan 2015 02:06:30 +0100 Subject: [PATCH] Add option for extended search (Autosearch) to Bookmark.html asking all connected peers for the searchterm added as description to the bookmark created by the bookmark icon. Intended for searches/research projects with not sufficient results from local and DHT selected remote target peers. Function: the process checks newly created bookmarks for description starting with "query=..." and takes this to ask every peer for 20 search results and adds it to the local index in a background job. link to start/stop the process added to /Bookmarks.html --- htroot/Bookmarks.html | 29 ++- htroot/Bookmarks.java | 38 +++ source/net/yacy/data/BookmarksDB.java | 9 + source/net/yacy/search/AutoSearch.java | 322 +++++++++++++++++++++++++ 4 files changed, 397 insertions(+), 1 deletion(-) create mode 100644 source/net/yacy/search/AutoSearch.java diff --git a/htroot/Bookmarks.html b/htroot/Bookmarks.html index 504732c83..22701bf1f 100644 --- a/htroot/Bookmarks.html +++ b/htroot/Bookmarks.html @@ -202,13 +202,40 @@ To see a list of all APIs, please visit the #[name]# (#[num]#) #{/optlist}# - +

#{taglist}##[name]# #{/taglist}#

+ +
+ Auto Search + #(autosearchrunning)# +

+

start autosearch of new bookmarks +

+
#[msg]#
+ :: + + + + + + + + + + +
autosearch queue:#[jobcount]#
received results:#[totalcount]#
current query:#[query]#
+ #(/autosearchrunning)# +
+

This starts a search of new or modified bookmarks since startup + in folder "search" with a description starting with "query="
+ Every peer online will be asked for results. +

+
diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 266d94cf4..d7fc531e6 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -55,7 +55,9 @@ import net.yacy.data.BookmarksDB.Tag; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.kelondro.workflow.BusyThread; import net.yacy.peers.NewsPool; +import net.yacy.search.AutoSearch; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -372,6 +374,42 @@ public class Bookmarks { count = 0; count = recurseFolders(BookmarkHelper.getFolderList("/", sb.bookmarksDB.getTagIterator(isAdmin)), "/", 0, true, ""); prop.put("display_folderlist", count); + + BusyThread bt = sb.getThread("autosearch"); + if (bt != null) { + prop.put("display_autosearchrunning","1"); + prop.put("display_autosearchrunning_msg", "" ); + if (post != null && post.containsKey("stopautosearch")) { + sb.terminateThread("autosearch", false); + prop.put("display_autosearchrunning_msg", "autosearch will terminate"); + prop.put("display_autosearchrunning","0"); + } + int jobs = bt.getJobCount(); + prop.put("display_autosearchrunning_jobcount", jobs); + int cnt=0; + String qstr = ""; + if (bt instanceof AutoSearch) { + cnt = ((AutoSearch) bt).gotresults; + qstr = ((AutoSearch) bt).currentQuery; + if (qstr == null) qstr = "---"; + } + prop.put("display_autosearchrunning_totalcount", cnt); + prop.put("display_autosearchrunning_query", qstr); + + } else { + prop.put("display_autosearchrunning", "0"); + prop.put("display_autosearchrunning_msg", ""); + if (post != null && post.containsKey("startautosearch")) { + sb.deployThread( + "autosearch", + "Auto Search", + "query all peers for given search terms", + null, + new AutoSearch(Switchboard.getSwitchboard()), + 1000); + prop.put("display_autosearchrunning_msg", "autsearch job started"); + } + } } return prop; // return from serverObjects respond() } 
diff --git a/source/net/yacy/data/BookmarksDB.java b/source/net/yacy/data/BookmarksDB.java index 049513af3..c232b85af 100644 --- a/source/net/yacy/data/BookmarksDB.java +++ b/source/net/yacy/data/BookmarksDB.java @@ -208,6 +208,15 @@ public class BookmarksDB { return set.iterator(); } + public Iterator getBookmarksIterator() { + try { + return new bookmarkIterator(true); + } catch (IOException ex) { + ConcurrentLog.logException(ex); + } + return null; + } + public Iterator getBookmarksIterator(final String tagName, final boolean priv){ final TreeSet set=new TreeSet(new bookmarkComparator(true)); final String tagHash=BookmarkHelper.tagHash(tagName); diff --git a/source/net/yacy/search/AutoSearch.java b/source/net/yacy/search/AutoSearch.java new file mode 100644 index 000000000..3548df031 --- /dev/null +++ b/source/net/yacy/search/AutoSearch.java @@ -0,0 +1,322 @@ +/** + * AutoSearch.java + * Copyright 2015 by Burkhard Buelte + * First released 09.01.2015 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.search; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; +import static net.yacy.cora.federate.opensearch.SRURSSConnector.loadSRURSS; +import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.federate.solr.instance.RemoteInstance; +import net.yacy.cora.federate.yacy.CacheStrategy; +import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.data.BookmarksDB.Bookmark; +import net.yacy.kelondro.workflow.AbstractBusyThread; +import net.yacy.peers.Seed; +import net.yacy.search.schema.CollectionSchema; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.params.CommonParams; + +/** + * AutoSearch retrieves queries from Bookmarks or a property file (if existing) + * and loops to a list of connected peers and asks each for results which are + * added to the local index. 
+ */ +public class AutoSearch extends AbstractBusyThread { + + private Set querystack; // serach query + public String currentQuery = null; // current query + private Set currentTargets = null; // peer hashes + final Switchboard sb; + public int gotresults; + private long lastInitTime; // to recognize new data (Bookmarks) to import + + public AutoSearch(Switchboard xsb) { + super(3000, 1000); // set lower limits of cycle delay + this.setIdleSleep(60000); // set actual cycle delays + this.setBusySleep(10000); + this.sb = xsb; + + gotresults = 0; + querystack = new HashSet(); + + this.lastInitTime = System.currentTimeMillis() - 600000; // init to now - 10 min + if (!checkBookmarkDB()) { + try { + // check for old queries in temp property file + File pfile = new File(xsb.dataPath, "DATA/SETTINGS/autosearch.conf"); + if (pfile.exists()) { + ConcurrentLog.info(AutoSearch.class.getName(), "read queries from file " + pfile.getAbsolutePath()); + Properties prop = new Properties(); + FileInputStream fileIn = new FileInputStream(pfile); + prop.load(fileIn); + if (prop.size() > 0) { + Set all = prop.keySet(); + for (Object s : all) { + String query = prop.getProperty((String) s); + if (query != null && !query.isEmpty()) { + querystack.add(query); + } + } + } + fileIn.close(); + } + } catch (final IOException e) { + ConcurrentLog.warn(AutoSearch.class.getName(), "Error reading config file"); + } + } + } + + /** + * Save current queries to a (temporary) property file to allow continue + * after a restart. Existing file will be overwritten or deleted. 
+ */ + private void saveasPropFile() { + File pfile = new File(sb.dataPath, "DATA/SETTINGS/autosearch.conf"); + if (querystack.size() == 0) { + if (pfile.exists()) { + pfile.delete(); + } + } else { + try { + Properties prop = new Properties(); + for (String s : querystack) { + prop.put("query" + s.hashCode(), s); + } + OutputStream fileOut = new FileOutputStream(pfile); + prop.store(fileOut, "AutoSearch query list"); + fileOut.close(); + } catch (FileNotFoundException ex) { + ConcurrentLog.warn(AutoSearch.class.getName(), "can not create file " + pfile.getAbsolutePath()); + } catch (IOException ex) { + ConcurrentLog.warn(AutoSearch.class.getName(), "IO error writing to file " + pfile.getAbsolutePath()); + } + } + } + + /** + * Get peers to query (peers connected) + * + * @return Set of peer hashes to contact + */ + private void initPeerList() { + if (currentTargets == null) { + currentTargets = new HashSet(); + } + // TODO: DHT peers could be excluded + Iterator it = Switchboard.getSwitchboard().peers.seedsConnected(true, false, null, 0); + int i=0; + while (it.hasNext() && i<5) { + Seed s = it.next(); + currentTargets.add(s.hash); i++; + } + } + + /** + * Check BookmarkDB for existing queries return true if new entry added to + * query queue. 
Store queries in (temporary) property file + * + * @return true if new query from bookmark was added + */ + private boolean checkBookmarkDB() { + int added = 0; + Iterator it = Switchboard.getSwitchboard().bookmarksDB.getBookmarksIterator(); + if (it != null) { + while (it.hasNext()) { + Bookmark bmk = it.next(); + // get search bookmarks only + if (bmk.getFoldersString().startsWith("/search")) { + // take only new created or edited bookmarks + if (bmk.getTimeStamp() >= this.lastInitTime) { + final String query = bmk.getDescription(); + if (!query.isEmpty() && query.startsWith("query=")) { + { + querystack.add(query.substring(6)); + added++; + ConcurrentLog.info(AutoSearch.class.getName(), "add query from Bookmarks " + query); + } + } + } + } + } + } + if (added > 0) { + this.lastInitTime = System.currentTimeMillis(); + saveasPropFile(); + return true; + } else { + return false; + } + } + + /** + * Process query queue, select one query and peer to ask next + * + * @return true if something processed + */ + @Override + public boolean job() { + + if (currentQuery == null && querystack != null && querystack.size() > 0) { + currentQuery = querystack.iterator().next(); + querystack.remove(currentQuery); // imediate remove to asure no repeat + initPeerList(); // late initialization of peerlist to get currently connected + } + + // ask next peer for search term + if (currentQuery != null && !currentQuery.isEmpty()) { + if (currentTargets != null && !currentTargets.isEmpty()) { + while (currentTargets.size() > 0) { // loop only to skip disconnected peers + String peerhash = currentTargets.iterator().next(); + currentTargets.remove(peerhash); + Seed seed = Switchboard.getSwitchboard().peers.getConnected(peerhash); + if (seed != null) { + processSingleTarget(seed); + return true; // just one query per busycycle is intended + } + } + } + currentQuery = null; + } + + // no search targets + checkBookmarkDB(); + + // TODO: do idle processing + // analyse content of local index + 
// extend search with learned new search terms + // follow most promising links + ConcurrentLog.fine(AutoSearch.class.getName(), "nothing to do"); + return this.querystack.size() > 0; + } + + /** + * Calls one peer for search results of the current query and adds it to the + * local index. Depending on peers SolrAvailable flag the a solr query or + * opensearch/rss query is used. + * + * @param seed the peer to ask + */ + private void processSingleTarget(Seed seed) { + ConcurrentLog.fine(AutoSearch.class.getName(), "ask " + seed.getIP() + " " + seed.getName() + " for query=" + currentQuery); + + if (seed.getFlagSolrAvailable()) { // do a solr query + SolrDocumentList docList = null; + SolrQuery solrQuery = new SolrQuery(); + // use remote defaults and ranking (to query their index right) + solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special + solrQuery.set("q.op", "AND"); // except ... no one word matches please + solrQuery.set(CommonParams.ROWS, "20"); + this.setName("Protocol.solrQuery(" + solrQuery.getQuery() + " to " + seed.hash + ")"); + try { + RemoteInstance instance = new RemoteInstance("http://" + seed.getPublicAddress(seed.getIP()) + "/solr/", null, null, 10000); // this is a 'patch configuration' which considers 'solr' as default collection + try { + SolrConnector solrConnector = new RemoteSolrConnector(instance, true, null); + if (!solrConnector.isClosed()) { + try { + QueryResponse rsp = solrConnector.getResponseByParams(solrQuery); + docList = rsp.getResults(); + } catch (Throwable e) { + } finally { + solrConnector.close(); + } + } + } catch (Throwable ee) { + } finally { + instance.close(); + } + if (docList != null) { + for (SolrDocument d : docList) { + sb.index.fulltext().putDocument(sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(d)); + this.gotresults++; + } + ConcurrentLog.info(AutoSearch.class.getName(), "added " + docList.size() + " results 
from " + seed.getName() + " to index for solrquery=" + currentQuery); + } + } catch (Throwable eee) { + } + } else { // do a yacysearch.rss query + final String rssSearchServiceURL = "http://" + seed.getPublicAddress(seed.getIP()) + "/yacysearch.rss"; + try { + RSSFeed feed = loadSRURSS( + rssSearchServiceURL, + currentQuery, + 0, + 20, + CacheStrategy.IFFRESH, + false, // just local, as we ask others too + ClientIdentification.yacyInternetCrawlerAgent); + final List urls = new ArrayList(); + for (final MultiProtocolURL entry : feed.getLinks()) { + urls.add(new DigestURL(entry, (byte[]) null)); + this.gotresults++; + } + sb.addToIndex(urls, null, "AutoSearch", null, true); + ConcurrentLog.info(AutoSearch.class.getName(), "added " + urls.size() + " results from " + seed.getName() + " to index for query=" + currentQuery); + } catch (IOException ex) { + ConcurrentLog.info(AutoSearch.class.getName(), "no answer from " + seed.getName()); + } + } + } + + /** + * Estimate of queries to perform + */ + @Override + public int getJobCount() { + if (currentTargets != null) { + int cnt = currentTargets.size(); + cnt += querystack.size() * sb.peers.sizeConnected(); + return cnt; + } else { + return 0; + } + } + + @Override + public void freemem() { + } + + @Override + public void close() { + this.saveasPropFile(); // saves or deletes property file with queries + } +}