From a2cb366b25e7e472fbac6d793b761569c1450a88 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 20 Jul 2014 00:00:43 +0200 Subject: [PATCH] Combine /heuristic search modifier with opensearch configured targets - with search modifier /heuristic a request is sent to all configured opensearch target systems (old /heuristic/blekko modifier no longer valid) - this allows using the opensearch heuristic on individual search requests (in contrast to configuration HEURISTIC_OPENSEARCH=true which sends an osd request on all global searches) - the index.html searchoption text is adjusted to be displayed only if option configured - add Archive-It to predefined systems --- defaults/heuristicopensearch.conf | 1 + htroot/index.html | 4 +- htroot/index.java | 8 +++- htroot/yacysearch.java | 8 ++-- .../opensearch/OpenSearchConnector.java | 4 +- .../federate/opensearch/SRURSSConnector.java | 4 +- source/net/yacy/search/Switchboard.java | 38 +++++++------------ 7 files changed, 29 insertions(+), 38 deletions(-) diff --git a/defaults/heuristicopensearch.conf b/defaults/heuristicopensearch.conf index ffd7030ef..143f25a03 100644 --- a/defaults/heuristicopensearch.conf +++ b/defaults/heuristicopensearch.conf @@ -18,3 +18,4 @@ #WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs #Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv #Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2 +#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web diff --git a/htroot/index.html b/htroot/index.html index 39c309195..97ff4a39f 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -175,8 +175,8 @@
heuristics
-
/heuristic/blekko
-
add search results from blekko
+
/heuristic
+
add search results from #[count]# external opensearch systems
#(/heuristic)# diff --git a/htroot/index.java b/htroot/index.java index 7f1f268e1..c35928d25 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -28,6 +28,7 @@ // javac -classpath .:../classes index.java // if the shell's current path is HTROOT +import java.io.IOException; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.protocol.RequestHeader; @@ -66,7 +67,12 @@ public class index { if (!sb.getConfigBool("search.options", true)) { searchoptions = 0; } else { // show heuristic hint on search option screen - prop.put("searchoptions_heuristic", sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false)); + int osdcnt = 0; // (only if some are active and heuristic is not ON by config) + try { + osdcnt = sb.tables.size("opensearchsys"); + } catch (IOException ex) { } + prop.put("searchoptions_heuristic", !sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && osdcnt > 0); + prop.put("searchoptions_heuristic_count", osdcnt); } final String former = (post == null) ? "" : post.get("former", ""); final int count = Math.min(100, (post == null) ? 
10 : post.getInt("count", 10)); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index e3ec46db4..6d4d420e4 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -467,10 +467,10 @@ public class yacysearch { } } - final int heuristicBlekko = querystring.indexOf("/heuristic/blekko", 0); + final int heuristicBlekko = querystring.indexOf("/heuristic", 0); if ( heuristicBlekko >= 0 ) { - querystring = querystring.replace("/heuristic/blekko", ""); - modifier.add("/heuristic/blekko"); + querystring = querystring.replace("/heuristic", ""); + modifier.add("/heuristic"); } final int tldp = querystring.indexOf("tld:", 0); @@ -708,7 +708,7 @@ public class yacysearch { sb.heuristicSite(theSearch, modifier.sitehost); } if ( heuristicBlekko >= 0 && authenticated && !stealthmode ) { - sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko"); + OpenSearchConnector.query(sb, theSearch); } if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) { OpenSearchConnector.query(sb, theSearch); diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java index 9590da53b..68fee2161 100644 --- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -107,9 +107,7 @@ public class OpenSearchConnector { Tables.Row row = ossysworktable.next(); String osurl = row.get("url", ""); String name = row.get("title", ""); - // to reuse existing heuristicRSS procedure replace querystring with "$" - // querystring is inserted/replaced inside heuristicRSS - sb.heuristicRSS(parseSearchTemplate(osurl, "$", 0, theSearch.query.itemsPerPage), theSearch, "opensearch:" + name); + sb.heuristicRSS(parseSearchTemplate(osurl, theSearch.query.getQueryGoal().getQueryString(false), 0, theSearch.query.itemsPerPage), theSearch, name); } } catch (final IOException ex) { 
ConcurrentLog.warn("OpenSearchConnector.query", "failed reading table opensearchsys"); diff --git a/source/net/yacy/cora/federate/opensearch/SRURSSConnector.java b/source/net/yacy/cora/federate/opensearch/SRURSSConnector.java index b1f11427d..3a0d83d18 100644 --- a/source/net/yacy/cora/federate/opensearch/SRURSSConnector.java +++ b/source/net/yacy/cora/federate/opensearch/SRURSSConnector.java @@ -116,7 +116,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator { final long st = System.currentTimeMillis(); RSSFeed feed; try { - feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global, agent); + feed = loadSRURSS(urlBase, query, startRecord, recordsPerSession, verify, global, agent); } catch (final IOException e1) { //e1.printStackTrace(); break mainloop; @@ -151,13 +151,11 @@ public class SRURSSConnector extends Thread implements SearchAccumulator { * @param maximumRecords maximum number of records * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned * @param global if true also search results from other peers are included - * @param timeout milliseconds that are waited at maximum for a search result * @return */ public static RSSFeed loadSRURSS( final String rssSearchServiceURL, final String query, - final long timeout, final int startRecord, final int maximumRecords, final CacheStrategy cacheStrategy, diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index ea48acef2..b4afb9065 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3545,39 +3545,29 @@ public final class Switchboard extends serverSwitch { }.start(); } - // blekko pattern: http://blekko.com/ws/$+/rss + /** + * Queries a remote opensearch system, expects RSS feed as response, parses the RSS feed and + * - adds the results to the results of the searchEvent + * - adds the results to the 
local index + * + * @param urlpattern the search query url (e.g. http://search.org?query=searchword) + * @param searchEvent + * @param feedName short/internal name of the remote system + */ public final void heuristicRSS( final String urlpattern, final SearchEvent searchEvent, final String feedName) { - final int p = urlpattern.indexOf('$'); - if ( p < 0 ) { - return; - } + new Thread() { @Override public void run() { - String queryString = searchEvent.query.getQueryGoal().getQueryString(false); - Thread.currentThread().setName("Switchboard.heuristicRSS:" + queryString); - final int meta = queryString.indexOf("heuristic:", 0); - if ( meta >= 0 ) { - final int q = queryString.indexOf(' ', meta); - if ( q >= 0 ) { - queryString = queryString.substring(0, meta) + queryString.substring(q + 1); - } else { - queryString = queryString.substring(0, meta); - } - } - - final String urlString = - urlpattern.substring(0, p) - + queryString.trim().replaceAll(" ", "+") - + urlpattern.substring(p + 1); + Thread.currentThread().setName("heuristicRSS:" + feedName); final DigestURL url; try { - url = new DigestURL(MultiProtocolURL.unescape(urlString)); + url = new DigestURL(MultiProtocolURL.unescape(urlpattern)); } catch (final MalformedURLException e1 ) { - ConcurrentLog.warn("heuristicRSS", "url not well-formed: '" + urlString + "'"); + ConcurrentLog.warn("heuristicRSS", "url not well-formed: '" + urlpattern + "'"); return; } @@ -3588,7 +3578,6 @@ public final class Switchboard extends serverSwitch { final Response response = Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent); final byte[] resource = (response == null) ? null : response.getContent(); - //System.out.println("BLEKKO: " + UTF8.String(resource)); rss = resource == null ? 
null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); if ( rss != null ) { final Map links = new TreeMap<>(); @@ -3610,7 +3599,6 @@ public final class Switchboard extends serverSwitch { addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"), true); } } catch (final Throwable e ) { - //Log.logException(e); } finally { searchEvent.oneFeederTerminated(); }