Combine /heuristic search modifier with opensearch configured targets

- with the search modifier /heuristic a request is sent to all configured opensearch target systems (the old /heuristic/blekko modifier is no longer valid)
- this allows using the opensearch heuristic on an individual search request (in contrast to the configuration HEURISTIC_OPENSEARCH=true, which sends an osd request on all global searches)
- the index.html searchoption text is adjusted to be displayed only if the option is configured
- add Archive-It to predefined systems
pull/1/head
reger 11 years ago
parent 2de159719b
commit a2cb366b25

@ -18,3 +18,4 @@
#WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs
#Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv
#Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2
#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web

@ -175,8 +175,8 @@
<dt style="width:100px">heuristics</dt>
<dd>
<dl style="width:500px">
<dt>/heuristic/blekko</dt>
<dd>add search results from blekko</dd>
<dt>/heuristic</dt>
<dd>add search results from #[count]# external opensearch systems</dd>
</dl>
</dd>
#(/heuristic)#

@ -28,6 +28,7 @@
// javac -classpath .:../classes index.java
// if the shell's current path is HTROOT
import java.io.IOException;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.protocol.RequestHeader;
@ -66,7 +67,12 @@ public class index {
if (!sb.getConfigBool("search.options", true)) {
searchoptions = 0;
} else { // show heuristic hint on search option screen
prop.put("searchoptions_heuristic", sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false));
int osdcnt = 0; // (only if some are active and heuristic is not ON by config)
try {
osdcnt = sb.tables.size("opensearchsys");
} catch (IOException ex) { }
prop.put("searchoptions_heuristic", !sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && osdcnt > 0);
prop.put("searchoptions_heuristic_count", osdcnt);
}
final String former = (post == null) ? "" : post.get("former", "");
final int count = Math.min(100, (post == null) ? 10 : post.getInt("count", 10));

@ -467,10 +467,10 @@ public class yacysearch {
}
}
final int heuristicBlekko = querystring.indexOf("/heuristic/blekko", 0);
final int heuristicBlekko = querystring.indexOf("/heuristic", 0);
if ( heuristicBlekko >= 0 ) {
querystring = querystring.replace("/heuristic/blekko", "");
modifier.add("/heuristic/blekko");
querystring = querystring.replace("/heuristic", "");
modifier.add("/heuristic");
}
final int tldp = querystring.indexOf("tld:", 0);
@ -708,7 +708,7 @@ public class yacysearch {
sb.heuristicSite(theSearch, modifier.sitehost);
}
if ( heuristicBlekko >= 0 && authenticated && !stealthmode ) {
sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko");
OpenSearchConnector.query(sb, theSearch);
}
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) {
OpenSearchConnector.query(sb, theSearch);

@ -107,9 +107,7 @@ public class OpenSearchConnector {
Tables.Row row = ossysworktable.next();
String osurl = row.get("url", "");
String name = row.get("title", "");
// to reuse existing heuristicRSS procedure replace querystring with "$"
// querystring is inserted/replaced inside heuristicRSS
sb.heuristicRSS(parseSearchTemplate(osurl, "$", 0, theSearch.query.itemsPerPage), theSearch, "opensearch:" + name);
sb.heuristicRSS(parseSearchTemplate(osurl, theSearch.query.getQueryGoal().getQueryString(false), 0, theSearch.query.itemsPerPage), theSearch, name);
}
} catch (final IOException ex) {
ConcurrentLog.warn("OpenSearchConnector.query", "failed reading table opensearchsys");

@ -116,7 +116,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
final long st = System.currentTimeMillis();
RSSFeed feed;
try {
feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global, agent);
feed = loadSRURSS(urlBase, query, startRecord, recordsPerSession, verify, global, agent);
} catch (final IOException e1) {
//e1.printStackTrace();
break mainloop;
@ -151,13 +151,11 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
* @param maximumRecords maximum number of records
* @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
* @param global if true also search results from other peers are included
* @param timeout milliseconds that are waited at maximum for a search result
* @return
*/
public static RSSFeed loadSRURSS(
final String rssSearchServiceURL,
final String query,
final long timeout,
final int startRecord,
final int maximumRecords,
final CacheStrategy cacheStrategy,

@ -3545,39 +3545,29 @@ public final class Switchboard extends serverSwitch {
}.start();
}
// blekko pattern: http://blekko.com/ws/$+/rss
/**
* Queries a remote opensearch system, expects RSS feed as response, parses the RSS feed and
* - adds the results to the results of the searchEvent
* - adds the results to the local index
*
* @param urlpattern the search query url (e.g. http://search.org?query=searchword)
* @param searchEvent
* @param feedName short/internal name of the remote system
*/
public final void heuristicRSS(
final String urlpattern,
final SearchEvent searchEvent,
final String feedName) {
final int p = urlpattern.indexOf('$');
if ( p < 0 ) {
return;
}
new Thread() {
@Override
public void run() {
String queryString = searchEvent.query.getQueryGoal().getQueryString(false);
Thread.currentThread().setName("Switchboard.heuristicRSS:" + queryString);
final int meta = queryString.indexOf("heuristic:", 0);
if ( meta >= 0 ) {
final int q = queryString.indexOf(' ', meta);
if ( q >= 0 ) {
queryString = queryString.substring(0, meta) + queryString.substring(q + 1);
} else {
queryString = queryString.substring(0, meta);
}
}
final String urlString =
urlpattern.substring(0, p)
+ queryString.trim().replaceAll(" ", "+")
+ urlpattern.substring(p + 1);
Thread.currentThread().setName("heuristicRSS:" + feedName);
final DigestURL url;
try {
url = new DigestURL(MultiProtocolURL.unescape(urlString));
url = new DigestURL(MultiProtocolURL.unescape(urlpattern));
} catch (final MalformedURLException e1 ) {
ConcurrentLog.warn("heuristicRSS", "url not well-formed: '" + urlString + "'");
ConcurrentLog.warn("heuristicRSS", "url not well-formed: '" + urlpattern + "'");
return;
}
@ -3588,7 +3578,6 @@ public final class Switchboard extends serverSwitch {
final Response response =
Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
final byte[] resource = (response == null) ? null : response.getContent();
//System.out.println("BLEKKO: " + UTF8.String(resource));
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
if ( rss != null ) {
final Map<AnchorURL, String> links = new TreeMap<>();
@ -3610,7 +3599,6 @@ public final class Switchboard extends serverSwitch {
addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"), true);
}
} catch (final Throwable e ) {
//Log.logException(e);
} finally {
searchEvent.oneFeederTerminated();
}

Loading…
Cancel
Save