From a2cb366b25e7e472fbac6d793b761569c1450a88 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 20 Jul 2014 00:00:43 +0200 Subject: [PATCH 1/4] Combine /heuristic search modifier with opensearch configured targets - with search modifier /heuristic a request is sent to all configured opensearch target systems (old /heuristic/blekko modifier no longer valid) - this allows using the opensearch heuristic on an individual search request (in contrast to configuration HEURISTIC_OPENSEARCH=true which sends an osd request on all global searches) - the index.html searchoption text is adjusted to be displayed only if option configured - add Archive-It to predefined systems --- defaults/heuristicopensearch.conf | 1 + htroot/index.html | 4 +- htroot/index.java | 8 +++- htroot/yacysearch.java | 8 ++-- .../opensearch/OpenSearchConnector.java | 4 +- .../federate/opensearch/SRURSSConnector.java | 4 +- source/net/yacy/search/Switchboard.java | 38 +++++++------------ 7 files changed, 29 insertions(+), 38 deletions(-) diff --git a/defaults/heuristicopensearch.conf b/defaults/heuristicopensearch.conf index ffd7030ef..143f25a03 100644 --- a/defaults/heuristicopensearch.conf +++ b/defaults/heuristicopensearch.conf @@ -18,3 +18,4 @@ #WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs #Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv #Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2 +#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web diff --git a/htroot/index.html b/htroot/index.html index 39c309195..97ff4a39f 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -175,8 +175,8 @@
heuristics
-
/heuristic/blekko
-
add search results from blekko
+
/heuristic
+
add search results from #[count]# external opensearch systems
#(/heuristic)# diff --git a/htroot/index.java b/htroot/index.java index 7f1f268e1..c35928d25 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -28,6 +28,7 @@ // javac -classpath .:../classes index.java // if the shell's current path is HTROOT +import java.io.IOException; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.protocol.RequestHeader; @@ -66,7 +67,12 @@ public class index { if (!sb.getConfigBool("search.options", true)) { searchoptions = 0; } else { // show heuristic hint on search option screen - prop.put("searchoptions_heuristic", sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false)); + int osdcnt = 0; // (only if some are active and heuristic is not ON by config) + try { + osdcnt = sb.tables.size("opensearchsys"); + } catch (IOException ex) { } + prop.put("searchoptions_heuristic", !sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && osdcnt > 0); + prop.put("searchoptions_heuristic_count", osdcnt); } final String former = (post == null) ? "" : post.get("former", ""); final int count = Math.min(100, (post == null) ? 
10 : post.getInt("count", 10)); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index e3ec46db4..6d4d420e4 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -467,10 +467,10 @@ public class yacysearch { } } - final int heuristicBlekko = querystring.indexOf("/heuristic/blekko", 0); + final int heuristicBlekko = querystring.indexOf("/heuristic", 0); if ( heuristicBlekko >= 0 ) { - querystring = querystring.replace("/heuristic/blekko", ""); - modifier.add("/heuristic/blekko"); + querystring = querystring.replace("/heuristic", ""); + modifier.add("/heuristic"); } final int tldp = querystring.indexOf("tld:", 0); @@ -708,7 +708,7 @@ public class yacysearch { sb.heuristicSite(theSearch, modifier.sitehost); } if ( heuristicBlekko >= 0 && authenticated && !stealthmode ) { - sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko"); + OpenSearchConnector.query(sb, theSearch); } if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) { OpenSearchConnector.query(sb, theSearch); diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java index 9590da53b..68fee2161 100644 --- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -107,9 +107,7 @@ public class OpenSearchConnector { Tables.Row row = ossysworktable.next(); String osurl = row.get("url", ""); String name = row.get("title", ""); - // to reuse existing heuristicRSS procedure replace querystring with "$" - // querystring is inserted/replaced inside heuristicRSS - sb.heuristicRSS(parseSearchTemplate(osurl, "$", 0, theSearch.query.itemsPerPage), theSearch, "opensearch:" + name); + sb.heuristicRSS(parseSearchTemplate(osurl, theSearch.query.getQueryGoal().getQueryString(false), 0, theSearch.query.itemsPerPage), theSearch, name); } } catch (final IOException ex) { 
ConcurrentLog.warn("OpenSearchConnector.query", "failed reading table opensearchsys"); diff --git a/source/net/yacy/cora/federate/opensearch/SRURSSConnector.java b/source/net/yacy/cora/federate/opensearch/SRURSSConnector.java index b1f11427d..3a0d83d18 100644 --- a/source/net/yacy/cora/federate/opensearch/SRURSSConnector.java +++ b/source/net/yacy/cora/federate/opensearch/SRURSSConnector.java @@ -116,7 +116,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator { final long st = System.currentTimeMillis(); RSSFeed feed; try { - feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global, agent); + feed = loadSRURSS(urlBase, query, startRecord, recordsPerSession, verify, global, agent); } catch (final IOException e1) { //e1.printStackTrace(); break mainloop; @@ -151,13 +151,11 @@ public class SRURSSConnector extends Thread implements SearchAccumulator { * @param maximumRecords maximum number of records * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned * @param global if true also search results from other peers are included - * @param timeout milliseconds that are waited at maximum for a search result * @return */ public static RSSFeed loadSRURSS( final String rssSearchServiceURL, final String query, - final long timeout, final int startRecord, final int maximumRecords, final CacheStrategy cacheStrategy, diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index ea48acef2..b4afb9065 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3545,39 +3545,29 @@ public final class Switchboard extends serverSwitch { }.start(); } - // blekko pattern: http://blekko.com/ws/$+/rss + /** + * Queries a remote opensearch system, expects RSS feed as response, parses the RSS feed and + * - adds the results to the results of the searchEvent + * - adds the results to the 
local index + * + * @param urlpattern the search query url (e.g. http://search.org?query=searchword) + * @param searchEvent + * @param feedName short/internal name of the remote system + */ public final void heuristicRSS( final String urlpattern, final SearchEvent searchEvent, final String feedName) { - final int p = urlpattern.indexOf('$'); - if ( p < 0 ) { - return; - } + new Thread() { @Override public void run() { - String queryString = searchEvent.query.getQueryGoal().getQueryString(false); - Thread.currentThread().setName("Switchboard.heuristicRSS:" + queryString); - final int meta = queryString.indexOf("heuristic:", 0); - if ( meta >= 0 ) { - final int q = queryString.indexOf(' ', meta); - if ( q >= 0 ) { - queryString = queryString.substring(0, meta) + queryString.substring(q + 1); - } else { - queryString = queryString.substring(0, meta); - } - } - - final String urlString = - urlpattern.substring(0, p) - + queryString.trim().replaceAll(" ", "+") - + urlpattern.substring(p + 1); + Thread.currentThread().setName("heuristicRSS:" + feedName); final DigestURL url; try { - url = new DigestURL(MultiProtocolURL.unescape(urlString)); + url = new DigestURL(MultiProtocolURL.unescape(urlpattern)); } catch (final MalformedURLException e1 ) { - ConcurrentLog.warn("heuristicRSS", "url not well-formed: '" + urlString + "'"); + ConcurrentLog.warn("heuristicRSS", "url not well-formed: '" + urlpattern + "'"); return; } @@ -3588,7 +3578,6 @@ public final class Switchboard extends serverSwitch { final Response response = Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent); final byte[] resource = (response == null) ? null : response.getContent(); - //System.out.println("BLEKKO: " + UTF8.String(resource)); rss = resource == null ? 
null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); if ( rss != null ) { final Map links = new TreeMap<>(); @@ -3610,7 +3599,6 @@ public final class Switchboard extends serverSwitch { addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"), true); } } catch (final Throwable e ) { - //Log.logException(e); } finally { searchEvent.oneFeederTerminated(); } From 7c1706d83a8c542af4a453c53f124d8997f67234 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 20 Jul 2014 00:06:22 +0200 Subject: [PATCH 2/4] use CRLF in generated bat command scripts for windows - for easier viewing with standard viewers --- .../net/yacy/peers/operation/yacyRelease.java | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index 1f3094ec7..54dc4b8da 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -398,19 +398,19 @@ public final class yacyRelease extends yacyVersion { try{ ConcurrentLog.info("RESTART", "INITIATED"); final String script = - "@echo off" + serverCore.LF_STRING + - "title YaCy restarter" + serverCore.LF_STRING + - "set loading=YACY RESTARTER" + serverCore.LF_STRING + - "echo %loading%" + serverCore.LF_STRING + - "cd \"" + sb.getDataPath().toString() + "/DATA/RELEASE/".replace("/", File.separator) + "\"" + serverCore.LF_STRING + - ":WAIT" + serverCore.LF_STRING + - "set loading=%loading%." 
+ serverCore.LF_STRING + - "cls" + serverCore.LF_STRING + - "echo %loading%" + serverCore.LF_STRING + - "ping -n 2 127.0.0.1 >nul" + serverCore.LF_STRING + - "IF exist ..\\yacy.running goto WAIT" + serverCore.LF_STRING + - "cd \"" + sb.getAppPath().toString() + "\"" + serverCore.LF_STRING + - "start /MIN CMD /C " + starterFile + serverCore.LF_STRING; + "@echo off" + serverCore.CRLF_STRING + + "title YaCy restarter" + serverCore.CRLF_STRING + + "set loading=YACY RESTARTER" + serverCore.CRLF_STRING + + "echo %loading%" + serverCore.CRLF_STRING + + "cd \"" + sb.getDataPath().toString() + "/DATA/RELEASE/".replace("/", File.separator) + "\"" + serverCore.CRLF_STRING + + ":WAIT" + serverCore.CRLF_STRING + + "set loading=%loading%." + serverCore.CRLF_STRING + + "cls" + serverCore.CRLF_STRING + + "echo %loading%" + serverCore.CRLF_STRING + + "ping -n 2 127.0.0.1 >nul" + serverCore.CRLF_STRING + + "IF exist ..\\yacy.running goto WAIT" + serverCore.CRLF_STRING + + "cd \"" + sb.getAppPath().toString() + "\"" + serverCore.CRLF_STRING + + "start /MIN CMD /C " + starterFile + serverCore.CRLF_STRING; final File scriptFile = new File(sb.getDataPath(), "DATA/RELEASE/restart.bat".replace("/", File.separator)); OS.deployScript(scriptFile, script); ConcurrentLog.info("RESTART", "wrote restart-script to " + scriptFile.getAbsolutePath()); @@ -495,38 +495,38 @@ public final class yacyRelease extends yacyVersion { if (startType.exists()) starterFile = "startYACY.bat"; // startType noconsole if (startParameter.startsWith("-gui")) starterFile += " " + startParameter; script = - "@echo off" + serverCore.LF_STRING + - "title YaCy updater" + serverCore.LF_STRING + - "set loading=YACY UPDATER" + serverCore.LF_STRING + - "echo %loading%" + serverCore.LF_STRING + - "cd \"" + sb.getDataPath().toString() + "/DATA/RELEASE/".replace("/", File.separator) + "\"" + serverCore.LF_STRING + - - ":WAIT" + serverCore.LF_STRING + - "set loading=%loading%." 
+ serverCore.LF_STRING + - "cls" + serverCore.LF_STRING + - "echo %loading%" + serverCore.LF_STRING + - "ping -n 2 127.0.0.1 >nul" + serverCore.LF_STRING + - "IF exist ..\\yacy.running goto WAIT" + serverCore.LF_STRING + - "IF not exist yacy goto NODATA" + serverCore.LF_STRING + - - "cd yacy" + serverCore.LF_STRING + - "del /Q \"" + sb.getAppPath().toString() + "\\lib\\*\" >nul" + serverCore.LF_STRING + - "xcopy *.* \"" + sb.getAppPath().toString() + "\" /E /Y >nul" + serverCore.LF_STRING + + "@echo off" + serverCore.CRLF_STRING + + "title YaCy updater" + serverCore.CRLF_STRING + + "set loading=YACY UPDATER" + serverCore.CRLF_STRING + + "echo %loading%" + serverCore.CRLF_STRING + + "cd \"" + sb.getDataPath().toString() + "/DATA/RELEASE/".replace("/", File.separator) + "\"" + serverCore.CRLF_STRING + + + ":WAIT" + serverCore.CRLF_STRING + + "set loading=%loading%." + serverCore.CRLF_STRING + + "cls" + serverCore.CRLF_STRING + + "echo %loading%" + serverCore.CRLF_STRING + + "ping -n 2 127.0.0.1 >nul" + serverCore.CRLF_STRING + + "IF exist ..\\yacy.running goto WAIT" + serverCore.CRLF_STRING + + "IF not exist yacy goto NODATA" + serverCore.CRLF_STRING + + + "cd yacy" + serverCore.CRLF_STRING + + "del /Q \"" + sb.getAppPath().toString() + "\\lib\\*\" >nul" + serverCore.CRLF_STRING + + "xcopy *.* \"" + sb.getAppPath().toString() + "\" /E /Y >nul" + serverCore.CRLF_STRING + // /E - all subdirectories // /Y - don't ask - "cd .." + serverCore.LF_STRING + - "rd yacy /S /Q" + serverCore.LF_STRING + + "cd .." 
+ serverCore.CRLF_STRING + + "rd yacy /S /Q" + serverCore.CRLF_STRING + // /S delete tree // /Q don't ask - "goto END" + serverCore.LF_STRING + + "goto END" + serverCore.CRLF_STRING + - ":NODATA" + serverCore.LF_STRING + - "echo YACY UPDATER ERROR: NO UPDATE SOURCE FILES ON FILESYSTEM" + serverCore.LF_STRING + - "pause" + serverCore.LF_STRING + + ":NODATA" + serverCore.CRLF_STRING + + "echo YACY UPDATER ERROR: NO UPDATE SOURCE FILES ON FILESYSTEM" + serverCore.CRLF_STRING + + "pause" + serverCore.CRLF_STRING + - ":END" + serverCore.LF_STRING + - "cd \"" + sb.getAppPath().toString() + "\"" + serverCore.LF_STRING + - "start /MIN CMD /C " + starterFile + serverCore.LF_STRING; + ":END" + serverCore.CRLF_STRING + + "cd \"" + sb.getAppPath().toString() + "\"" + serverCore.CRLF_STRING + + "start /MIN CMD /C " + starterFile + serverCore.CRLF_STRING; scriptFileName = "update.bat"; } else { // unix/linux script = From 5f5fb4ecdcd4b811ddb2845e8a9ca8e531e0b1ed Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 20 Jul 2014 02:49:49 +0200 Subject: [PATCH 3/4] remove unused static (RSS)search from protocol --- source/net/yacy/peers/Protocol.java | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index cd270f69d..adc8531dc 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -68,11 +68,9 @@ import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.document.feed.RSSReader; import net.yacy.cora.document.id.MultiProtocolURL; -import net.yacy.cora.federate.opensearch.SRURSSConnector; import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.instance.RemoteInstance; -import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; import 
net.yacy.cora.protocol.ClientIdentification; @@ -556,29 +554,6 @@ public final class Protocol { } } - public static RSSFeed search( - final Seed targetSeed, - final String query, - final CacheStrategy verify, - final boolean global, - final long timeout, - final int startRecord, - final int maximumRecords) throws IOException { - final String address = - (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? "localhost:" - + Switchboard.getSwitchboard().getConfig("port", "8090") : targetSeed.getClusterAddress(); - final String urlBase = "http://" + address + "/yacysearch.rss"; - return SRURSSConnector.loadSRURSS( - urlBase, - query, - timeout, - startRecord, - maximumRecords, - verify, - global, - ClientIdentification.yacyInternetCrawlerAgent); - } - protected static int primarySearch( final SearchEvent event, final String wordhashes, From 8004cfc96191afa0a18effff2178db4fe2fa4ede Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 20 Jul 2014 12:28:59 +0200 Subject: [PATCH 4/4] fix input boostfield factor of 0.0 in RankingSolr - input was accepted and stored but not editable (added check factor >0.0 during edit) - make use of some more predefined solr constants --- htroot/RankingSolr_p.java | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/htroot/RankingSolr_p.java b/htroot/RankingSolr_p.java index aa930c9be..a668460ea 100644 --- a/htroot/RankingSolr_p.java +++ b/htroot/RankingSolr_p.java @@ -29,6 +29,8 @@ import net.yacy.search.query.SearchEventCache; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.DisMaxParams; public class RankingSolr_p { @@ -54,8 +56,10 @@ public class RankingSolr_p { if (fieldValue == null || fieldValue.length() == 0) continue; try { float boost = Float.parseFloat(fieldValue); - if (boostString.length() > 0) 
boostString.append(','); - boostString.append(field.getSolrFieldName()).append('^').append(Float.toString(boost)); + if (boost > 0.0f) { // don't allow <= 0 + if (boostString.length() > 0) boostString.append(','); + boostString.append(field.getSolrFieldName()).append('^').append(Float.toString(boost)); + } } catch (final NumberFormatException e) { continue; } @@ -74,7 +78,7 @@ public class RankingSolr_p { } if (post != null && post.containsKey("EnterBQ")) { - String bq = post.get("bq"); + String bq = post.get(DisMaxParams.BQ); if (bq != null) { sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + profileNr, bq); sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostQuery(bq); @@ -89,7 +93,7 @@ public class RankingSolr_p { } if (post != null && post.containsKey("EnterFQ")) { - String fq = post.get("fq"); + String fq = post.get(CommonParams.FQ); if (fq != null) { sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_FILTERQUERY_ + profileNr, fq); sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setFilterQuery(fq); @@ -104,7 +108,7 @@ public class RankingSolr_p { } if (post != null && post.containsKey("EnterBF")) { - String bf = post.get("bf"); + String bf = post.get(DisMaxParams.BF); if (bf != null) { sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + profileNr, bf); sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostFunction(bf); @@ -139,9 +143,9 @@ public class RankingSolr_p { i++; } prop.put("boosts", i); - prop.put("fq", ranking.getFilterQuery()); - prop.put("bq", ranking.getBoostQuery()); - prop.put("bf", ranking.getBoostFunction()); + prop.put(CommonParams.FQ, ranking.getFilterQuery()); + prop.put(DisMaxParams.BQ, ranking.getBoostQuery()); + prop.put(DisMaxParams.BF, ranking.getBoostFunction()); for (int j = 0; j < 4; j++) { prop.put("profiles_" + j + "_nr", j);